Initial community commit
parent 537bcbc862
commit fc06254474
16440 changed files with 4239995 additions and 2 deletions
353
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/arm2gnu.pl
vendored
Normal file
@@ -0,0 +1,353 @@
#!/usr/bin/perl
|
||||
# Copyright (C) 2002-2013 Xiph.org Foundation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
#
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
my $bigend; # little/big endian
|
||||
my $nxstack;
|
||||
my $apple = 0;
|
||||
my $symprefix = "";
|
||||
|
||||
$nxstack = 0;
|
||||
|
||||
eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
|
||||
if $running_under_some_shell;
|
||||
|
||||
while ($ARGV[0] =~ /^-/) {
|
||||
$_ = shift;
|
||||
last if /^--$/;
|
||||
if (/^-n$/) {
|
||||
$nflag++;
|
||||
next;
|
||||
}
|
||||
if (/^--apple$/) {
|
||||
$apple = 1;
|
||||
$symprefix = "_";
|
||||
next;
|
||||
}
|
||||
die "I don't recognize this switch: $_\\n";
|
||||
}
|
||||
$printit++ unless $nflag;
|
||||
|
||||
$\ = "\n"; # automatically add newline on print
|
||||
$n=0;
|
||||
|
||||
$thumb = 0; # ARM mode by default, not Thumb.
|
||||
@proc_stack = ();
|
||||
|
||||
printf (" .syntax unified\n");
|
||||
|
||||
LINE:
|
||||
while (<>) {
|
||||
|
||||
# For ADRLs we need to add a new line after the substituted one.
|
||||
$addPadding = 0;
|
||||
|
||||
# First, we do not dare to touch *anything* inside double quotes, do we?
|
||||
# Second, if you want a dollar character in the string,
|
||||
# insert two of them -- that's how ARM C and assembler treat strings.
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next };
|
||||
s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next };
|
||||
s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next };
|
||||
# If there's nothing on a line but a comment, don't try to apply any further
|
||||
# substitutions (this is a cheap hack to avoid mucking up the license header)
|
||||
s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next };
|
||||
# If substituted -- leave immediately !
|
||||
|
||||
s/@/,:/;
|
||||
s/;/@/;
|
||||
while ( /@.*'/ ) {
|
||||
s/(@.*)'/$1/g;
|
||||
}
|
||||
s/\{FALSE\}/0/g;
|
||||
s/\{TRUE\}/1/g;
|
||||
s/\{(\w\w\w\w+)\}/$1/g;
|
||||
s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
|
||||
s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
|
||||
s/\bIMPORT\b/.extern/;
|
||||
s/\bEXPORT\b\s*/.global $symprefix/;
|
||||
s/^(\s+)\[/$1IF/;
|
||||
s/^(\s+)\|/$1ELSE/;
|
||||
s/^(\s+)\]/$1ENDIF/;
|
||||
s/IF *:DEF:/ .ifdef/;
|
||||
s/IF *:LNOT: *:DEF:/ .ifndef/;
|
||||
s/ELSE/ .else/;
|
||||
s/ENDIF/ .endif/;
|
||||
|
||||
if( /\bIF\b/ ) {
|
||||
s/\bIF\b/ .if/;
|
||||
s/=/==/;
|
||||
}
|
||||
if ( $n == 2) {
|
||||
s/\$/\\/g;
|
||||
}
|
||||
if ($n == 1) {
|
||||
s/\$//g;
|
||||
s/label//g;
|
||||
$n = 2;
|
||||
}
|
||||
if ( /MACRO/ ) {
|
||||
s/MACRO *\n/.macro/;
|
||||
$n=1;
|
||||
}
|
||||
if ( /\bMEND\b/ ) {
|
||||
s/\bMEND\b/.endm/;
|
||||
$n=0;
|
||||
}
|
||||
|
||||
# ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
|
||||
#
|
||||
if ( /\bAREA\b/ ) {
|
||||
my $align;
|
||||
$align = "2";
|
||||
if ( /ALIGN=(\d+)/ ) {
|
||||
$align = $1;
|
||||
}
|
||||
if ( /CODE/ ) {
|
||||
$nxstack = 1;
|
||||
}
|
||||
s/^(.+)CODE(.+)READONLY(.*)/ .text/;
|
||||
s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata/;
|
||||
s/^(.+)\|\|\.data\|\|(.+)/ .data/;
|
||||
s/^(.+)\|\|\.bss\|\|(.+)/ .bss/;
|
||||
s/$/; .p2align $align/;
|
||||
# Enable NEON instructions but don't produce a binary that requires
|
||||
# ARMv7. RVCT does not have equivalent directives, so we just do this
|
||||
# for all CODE areas.
|
||||
if ( /.text/ ) {
|
||||
# Separating .arch, .fpu, etc., by semicolons does not work (gas
|
||||
# thinks the semicolon is part of the arch name, even when there's
|
||||
# whitespace separating them). Sadly this means our line numbers
|
||||
# won't match the original source file (we could use the .line
|
||||
# directive, which is documented to be obsolete, but then gdb will
|
||||
# show the wrong line in the translated source file).
|
||||
s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/ unless ($apple);
|
||||
}
|
||||
}
|
||||
|
||||
s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3||
|
||||
s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2||
|
||||
s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2||
|
||||
s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
|
||||
s/^(\s+)\%(\s)/ .space $1/;
|
||||
|
||||
s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123
|
||||
s/\bCODE32\b/.code 32/ && do {$thumb = 0};
|
||||
s/\bCODE16\b/.code 16/ && do {$thumb = 1};
|
||||
if (/\bPROC\b/)
|
||||
{
|
||||
my $prefix;
|
||||
my $proc;
|
||||
/^([A-Za-z_\.]\w+)\b/;
|
||||
$proc = $1;
|
||||
$prefix = "";
|
||||
if ($proc)
|
||||
{
|
||||
$prefix = $prefix.sprintf("\t.type\t%s, %%function", $proc) unless ($apple);
|
||||
# Make sure $prefix isn't empty here (for the $apple case).
|
||||
# We handle mangling the label here, make sure it doesn't match
|
||||
# the label handling below (if $prefix would be empty).
|
||||
$prefix = $prefix."; ";
|
||||
push(@proc_stack, $proc);
|
||||
s/^[A-Za-z_\.]\w+/$symprefix$&:/;
|
||||
}
|
||||
$prefix = $prefix."\t.thumb_func; " if ($thumb);
|
||||
s/\bPROC\b/@ $&/;
|
||||
$_ = $prefix.$_;
|
||||
}
|
||||
s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
|
||||
s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
|
||||
if (/\bENDP\b/)
|
||||
{
|
||||
my $proc;
|
||||
s/\bENDP\b/@ $&/;
|
||||
$proc = pop(@proc_stack);
|
||||
$_ = "\t.size $proc, .-$proc".$_ if ($proc && !$apple);
|
||||
}
|
||||
s/\bSUBT\b/@ $&/;
|
||||
s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25
|
||||
s/\bKEEP\b/@ $&/;
|
||||
s/\bEXPORTAS\b/@ $&/;
|
||||
s/\|\|(.)+\bEQU\b/@ $&/;
|
||||
s/\|\|([\w\$]+)\|\|/$1/;
|
||||
s/\bENTRY\b/@ $&/;
|
||||
s/\bASSERT\b/@ $&/;
|
||||
s/\bGBLL\b/@ $&/;
|
||||
s/\bGBLA\b/@ $&/;
|
||||
s/^\W+OPT\b/@ $&/;
|
||||
s/:OR:/|/g;
|
||||
s/:SHL:/<</g;
|
||||
s/:SHR:/>>/g;
|
||||
s/:AND:/&/g;
|
||||
s/:LAND:/&&/g;
|
||||
s/CPSR/cpsr/;
|
||||
s/SPSR/spsr/;
|
||||
s/ALIGN$/.balign 4/;
|
||||
s/ALIGN\s+([0-9x]+)$/.balign $1/;
|
||||
s/psr_cxsf/psr_all/;
|
||||
s/LTORG/.ltorg/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
|
||||
|
||||
# {PC} + 0xdeadfeed --> . + 0xdeadfeed
|
||||
s/\{PC\} \+/ \. +/;
|
||||
|
||||
# Single hex constant on the line !
|
||||
#
|
||||
# >>> NOTE <<<
|
||||
# Double-precision floats in gcc are always mixed-endian, which means
|
||||
# bytes in two words are little-endian, but words are big-endian.
|
||||
# So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
|
||||
# and 0xfeed0000 at high address.
|
||||
#
|
||||
s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
|
||||
# Only decimal constants on the line, no hex !
|
||||
s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
|
||||
|
||||
# Single hex constant on the line !
|
||||
# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
|
||||
# Only decimal constants on the line, no hex !
|
||||
# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
|
||||
s/\bDCFS[ \t]+0x/.word 0x/;
|
||||
s/\bDCFS\b/.float/;
|
||||
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
|
||||
s/\bDCD\b/.word/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
|
||||
s/\bDCW\b/.short/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
|
||||
s/\bDCB\b/.byte/;
|
||||
s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
|
||||
s/^[A-Za-z_\.]\w+/$&:/;
|
||||
s/^(\d+)/$1:/;
|
||||
s/\%(\d+)/$1b_or_f/;
|
||||
s/\%[Bb](\d+)/$1b/;
|
||||
s/\%[Ff](\d+)/$1f/;
|
||||
s/\%[Ff][Tt](\d+)/$1f/;
|
||||
s/&([\dA-Fa-f]+)/0x$1/;
|
||||
if ( /\b2_[01]+\b/ ) {
|
||||
s/\b2_([01]+)\b/conv$1&&&&/g;
|
||||
while ( /[01][01][01][01]&&&&/ ) {
|
||||
s/0000&&&&/&&&&0/g;
|
||||
s/0001&&&&/&&&&1/g;
|
||||
s/0010&&&&/&&&&2/g;
|
||||
s/0011&&&&/&&&&3/g;
|
||||
s/0100&&&&/&&&&4/g;
|
||||
s/0101&&&&/&&&&5/g;
|
||||
s/0110&&&&/&&&&6/g;
|
||||
s/0111&&&&/&&&&7/g;
|
||||
s/1000&&&&/&&&&8/g;
|
||||
s/1001&&&&/&&&&9/g;
|
||||
s/1010&&&&/&&&&A/g;
|
||||
s/1011&&&&/&&&&B/g;
|
||||
s/1100&&&&/&&&&C/g;
|
||||
s/1101&&&&/&&&&D/g;
|
||||
s/1110&&&&/&&&&E/g;
|
||||
s/1111&&&&/&&&&F/g;
|
||||
}
|
||||
s/000&&&&/&&&&0/g;
|
||||
s/001&&&&/&&&&1/g;
|
||||
s/010&&&&/&&&&2/g;
|
||||
s/011&&&&/&&&&3/g;
|
||||
s/100&&&&/&&&&4/g;
|
||||
s/101&&&&/&&&&5/g;
|
||||
s/110&&&&/&&&&6/g;
|
||||
s/111&&&&/&&&&7/g;
|
||||
s/00&&&&/&&&&0/g;
|
||||
s/01&&&&/&&&&1/g;
|
||||
s/10&&&&/&&&&2/g;
|
||||
s/11&&&&/&&&&3/g;
|
||||
s/0&&&&/&&&&0/g;
|
||||
s/1&&&&/&&&&1/g;
|
||||
s/conv&&&&/0x/g;
|
||||
}
|
||||
|
||||
if ( /commandline/)
|
||||
{
|
||||
if( /-bigend/)
|
||||
{
|
||||
$bigend=1;
|
||||
}
|
||||
}
|
||||
|
||||
if ( /\bDCDU\b/ )
|
||||
{
|
||||
my $cmd=$_;
|
||||
my $value;
|
||||
my $prefix;
|
||||
my $w1;
|
||||
my $w2;
|
||||
my $w3;
|
||||
my $w4;
|
||||
|
||||
s/\s+DCDU\b/@ $&/;
|
||||
|
||||
$cmd =~ /\bDCDU\b\s+0x(\d+)/;
|
||||
$value = $1;
|
||||
$value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
|
||||
$w1 = $1;
|
||||
$w2 = $2;
|
||||
$w3 = $3;
|
||||
$w4 = $4;
|
||||
|
||||
if( $bigend ne "")
|
||||
{
|
||||
# big endian
|
||||
$prefix = "\t.byte\t0x".$w1.";".
|
||||
"\t.byte\t0x".$w2.";".
|
||||
"\t.byte\t0x".$w3.";".
|
||||
"\t.byte\t0x".$w4."; ";
|
||||
}
|
||||
else
|
||||
{
|
||||
# little endian
|
||||
$prefix = "\t.byte\t0x".$w4.";".
|
||||
"\t.byte\t0x".$w3.";".
|
||||
"\t.byte\t0x".$w2.";".
|
||||
"\t.byte\t0x".$w1."; ";
|
||||
}
|
||||
$_=$prefix.$_;
|
||||
}
|
||||
|
||||
if ( /\badrl\b/i )
|
||||
{
|
||||
s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
|
||||
$addPadding = 1;
|
||||
}
|
||||
s/\bEND\b/@ END/;
|
||||
} continue {
|
||||
printf ("%s", $_) if $printit;
|
||||
if ($addPadding != 0)
|
||||
{
|
||||
printf (" mov r0,r0\n");
|
||||
$addPadding = 0;
|
||||
}
|
||||
}
|
||||
# If we had a code section, mark that this object doesn't need an executable
|
||||
# stack.
|
||||
if ($nxstack && !$apple) {
|
||||
printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n");
|
||||
}
|
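The script rewrites RVCT-style ARM assembly into GNU as syntax one line at a time with regex substitutions; its DCDU branch splits a 32-bit constant into four .byte directives whose order depends on whether -bigend was seen on the command line. The stand-alone C sketch below mirrors just that byte-ordering step; the function name and output formatting are illustrative assumptions, not part of the Perl script.

#include <stdio.h>

/* Emit a 32-bit word as four .byte directives, most-significant byte
 * first for big-endian output, least-significant byte first otherwise. */
static void emit_dcdu(unsigned long value, int bigend)
{
   unsigned b3 = (value >> 24) & 0xff, b2 = (value >> 16) & 0xff;
   unsigned b1 = (value >> 8)  & 0xff, b0 =  value        & 0xff;
   if (bigend)
      printf("\t.byte 0x%02x; .byte 0x%02x; .byte 0x%02x; .byte 0x%02x\n",
             b3, b2, b1, b0);
   else
      printf("\t.byte 0x%02x; .byte 0x%02x; .byte 0x%02x; .byte 0x%02x\n",
             b0, b1, b2, b3);
}

int main(void)
{
   emit_dcdu(0x12345678UL, 0); /* little-endian emission */
   emit_dcdu(0x12345678UL, 1); /* big-endian emission */
   return 0;
}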
160
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/arm_celt_map.c
vendored
Normal file
@@ -0,0 +1,160 @@
/* Copyright (c) 2010 Xiph.Org Foundation
|
||||
* Copyright (c) 2013 Parrot */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pitch.h"
|
||||
#include "kiss_fft.h"
|
||||
#include "mdct.h"
|
||||
|
||||
#if defined(OPUS_HAVE_RTCD)
|
||||
|
||||
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
|
||||
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
|
||||
celt_inner_prod_c, /* ARMv4 */
|
||||
celt_inner_prod_c, /* EDSP */
|
||||
celt_inner_prod_c, /* Media */
|
||||
celt_inner_prod_neon /* NEON */
|
||||
};
|
||||
|
||||
void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
|
||||
int N, opus_val32 *xy1, opus_val32 *xy2) = {
|
||||
dual_inner_prod_c, /* ARMv4 */
|
||||
dual_inner_prod_c, /* EDSP */
|
||||
dual_inner_prod_c, /* Media */
|
||||
dual_inner_prod_neon /* NEON */
|
||||
};
|
||||
# endif
|
||||
|
||||
# if defined(FIXED_POINT)
|
||||
# if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
|
||||
(defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
|
||||
(defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
|
||||
opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
|
||||
const opus_val16 *, opus_val32 *, int, int, int) = {
|
||||
celt_pitch_xcorr_c, /* ARMv4 */
|
||||
MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
|
||||
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
|
||||
MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
|
||||
};
|
||||
|
||||
# endif
|
||||
# else /* !FIXED_POINT */
|
||||
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
|
||||
void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
|
||||
const opus_val16 *, opus_val32 *, int, int, int) = {
|
||||
celt_pitch_xcorr_c, /* ARMv4 */
|
||||
celt_pitch_xcorr_c, /* EDSP */
|
||||
celt_pitch_xcorr_c, /* Media */
|
||||
celt_pitch_xcorr_float_neon /* Neon */
|
||||
};
|
||||
# endif
|
||||
# endif /* FIXED_POINT */
|
||||
|
||||
#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \
|
||||
defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
|
||||
|
||||
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
|
||||
const opus_val16 *x,
|
||||
const opus_val16 *y,
|
||||
opus_val32 sum[4],
|
||||
int len
|
||||
) = {
|
||||
xcorr_kernel_c, /* ARMv4 */
|
||||
xcorr_kernel_c, /* EDSP */
|
||||
xcorr_kernel_c, /* Media */
|
||||
xcorr_kernel_neon_fixed, /* Neon */
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
# if defined(HAVE_ARM_NE10)
|
||||
# if defined(CUSTOM_MODES)
|
||||
int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
|
||||
opus_fft_alloc_arch_c, /* ARMv4 */
|
||||
opus_fft_alloc_arch_c, /* EDSP */
|
||||
opus_fft_alloc_arch_c, /* Media */
|
||||
opus_fft_alloc_arm_neon /* Neon with NE10 library support */
|
||||
};
|
||||
|
||||
void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
|
||||
opus_fft_free_arch_c, /* ARMv4 */
|
||||
opus_fft_free_arch_c, /* EDSP */
|
||||
opus_fft_free_arch_c, /* Media */
|
||||
opus_fft_free_arm_neon /* Neon with NE10 */
|
||||
};
|
||||
# endif /* CUSTOM_MODES */
|
||||
|
||||
void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
|
||||
const kiss_fft_cpx *fin,
|
||||
kiss_fft_cpx *fout) = {
|
||||
opus_fft_c, /* ARMv4 */
|
||||
opus_fft_c, /* EDSP */
|
||||
opus_fft_c, /* Media */
|
||||
opus_fft_neon /* Neon with NE10 */
|
||||
};
|
||||
|
||||
void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
|
||||
const kiss_fft_cpx *fin,
|
||||
kiss_fft_cpx *fout) = {
|
||||
opus_ifft_c, /* ARMv4 */
|
||||
opus_ifft_c, /* EDSP */
|
||||
opus_ifft_c, /* Media */
|
||||
opus_ifft_neon /* Neon with NE10 */
|
||||
};
|
||||
|
||||
void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
|
||||
kiss_fft_scalar *in,
|
||||
kiss_fft_scalar * OPUS_RESTRICT out,
|
||||
const opus_val16 *window,
|
||||
int overlap, int shift,
|
||||
int stride, int arch) = {
|
||||
clt_mdct_forward_c, /* ARMv4 */
|
||||
clt_mdct_forward_c, /* EDSP */
|
||||
clt_mdct_forward_c, /* Media */
|
||||
clt_mdct_forward_neon /* Neon with NE10 */
|
||||
};
|
||||
|
||||
void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
|
||||
kiss_fft_scalar *in,
|
||||
kiss_fft_scalar * OPUS_RESTRICT out,
|
||||
const opus_val16 *window,
|
||||
int overlap, int shift,
|
||||
int stride, int arch) = {
|
||||
clt_mdct_backward_c, /* ARMv4 */
|
||||
clt_mdct_backward_c, /* EDSP */
|
||||
clt_mdct_backward_c, /* Media */
|
||||
clt_mdct_backward_neon /* Neon with NE10 */
|
||||
};
|
||||
|
||||
# endif /* HAVE_ARM_NE10 */
|
||||
# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
|
||||
|
||||
#endif /* OPUS_HAVE_RTCD */
|
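Each of these tables has one entry per architecture tier (ARMv4, EDSP, Media, NEON) and is indexed with the runtime arch value masked by OPUS_ARCHMASK; slower tiers simply reuse the portable C kernel. A minimal C sketch of that dispatch pattern, using placeholder names rather than the actual Opus wrapper macros:

#include <stdio.h>

#define OPUS_ARCHMASK 3  /* assumed: low bits select the table slot */

typedef float opus_val16;
typedef float opus_val32;

/* Portable reference kernel used to fill the slower slots. */
static opus_val32 inner_prod_c(const opus_val16 *x, const opus_val16 *y, int N)
{
   opus_val32 s = 0;
   for (int i = 0; i < N; i++) s += x[i]*y[i];
   return s;
}

/* One entry per arch index (ARMv4, EDSP, Media, NEON). */
static opus_val32 (*const INNER_PROD_IMPL[OPUS_ARCHMASK+1])(
      const opus_val16 *, const opus_val16 *, int) = {
   inner_prod_c, inner_prod_c, inner_prod_c, inner_prod_c
};

/* Dispatch: mask the arch value and call through the table. */
#define inner_prod(x, y, N, arch) \
   ((*INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))

int main(void)
{
   opus_val16 a[4] = {1, 2, 3, 4}, b[4] = {1, 1, 1, 1};
   int arch = 3; /* e.g. the index a NEON-capable CPU would select */
   printf("%f\n", (double)inner_prod(a, b, 4, arch)); /* 10.0 */
   return 0;
}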
185
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/armcpu.c
vendored
Normal file
@@ -0,0 +1,185 @@
/* Copyright (c) 2010 Xiph.Org Foundation
|
||||
* Copyright (c) 2013 Parrot */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* Original code from libtheora modified to suit to Opus */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#ifdef OPUS_HAVE_RTCD
|
||||
|
||||
#include "armcpu.h"
|
||||
#include "cpu_support.h"
|
||||
#include "os_support.h"
|
||||
#include "opus_types.h"
|
||||
#include "arch.h"
|
||||
|
||||
#define OPUS_CPU_ARM_V4_FLAG (1<<OPUS_ARCH_ARM_V4)
|
||||
#define OPUS_CPU_ARM_EDSP_FLAG (1<<OPUS_ARCH_ARM_EDSP)
|
||||
#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
|
||||
#define OPUS_CPU_ARM_NEON_FLAG (1<<OPUS_ARCH_ARM_NEON)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# define WIN32_EXTRA_LEAN
|
||||
# include <windows.h>
|
||||
|
||||
static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
|
||||
opus_uint32 flags;
|
||||
flags=0;
|
||||
/* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
|
||||
* instructions via their assembled hex code.
|
||||
* All of these instructions should be essentially nops. */
|
||||
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|
||||
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
__try{
|
||||
/*PLD [r13]*/
|
||||
__emit(0xF5DDF000);
|
||||
flags|=OPUS_CPU_ARM_EDSP_FLAG;
|
||||
}
|
||||
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|
||||
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
__try{
|
||||
/*SHADD8 r3,r3,r3*/
|
||||
__emit(0xE6333F93);
|
||||
flags|=OPUS_CPU_ARM_MEDIA_FLAG;
|
||||
}
|
||||
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
__try{
|
||||
/*VORR q0,q0,q0*/
|
||||
__emit(0xF2200150);
|
||||
flags|=OPUS_CPU_ARM_NEON_FLAG;
|
||||
}
|
||||
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
return flags;
|
||||
}
|
||||
|
||||
#elif defined(__linux__)
|
||||
/* Linux based */
|
||||
opus_uint32 opus_cpu_capabilities(void)
|
||||
{
|
||||
opus_uint32 flags = 0;
|
||||
FILE *cpuinfo;
|
||||
|
||||
/* Reading /proc/self/auxv would be easier, but that doesn't work reliably on
|
||||
* Android */
|
||||
cpuinfo = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
if(cpuinfo != NULL)
|
||||
{
|
||||
/* 512 should be enough for anybody (it's even enough for all the flags that
|
||||
* x86 has accumulated... so far). */
|
||||
char buf[512];
|
||||
|
||||
while(fgets(buf, 512, cpuinfo) != NULL)
|
||||
{
|
||||
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|
||||
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
/* Search for edsp and neon flag */
|
||||
if(memcmp(buf, "Features", 8) == 0)
|
||||
{
|
||||
char *p;
|
||||
p = strstr(buf, " edsp");
|
||||
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
|
||||
flags |= OPUS_CPU_ARM_EDSP_FLAG;
|
||||
|
||||
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
p = strstr(buf, " neon");
|
||||
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
|
||||
flags |= OPUS_CPU_ARM_NEON_FLAG;
|
||||
# endif
|
||||
}
|
||||
# endif
|
||||
|
||||
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|
||||
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
/* Search for media capabilities (>= ARMv6) */
|
||||
if(memcmp(buf, "CPU architecture:", 17) == 0)
|
||||
{
|
||||
int version;
|
||||
version = atoi(buf+17);
|
||||
|
||||
if(version >= 6)
|
||||
flags |= OPUS_CPU_ARM_MEDIA_FLAG;
|
||||
}
|
||||
# endif
|
||||
}
|
||||
|
||||
fclose(cpuinfo);
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
#else
|
||||
/* The feature registers which can tell us what the processor supports are
|
||||
* accessible in privileged modes only, so we can't have a general user-space
|
||||
* detection method like on x86.*/
|
||||
# error "Configured to use ARM asm but no CPU detection method available for " \
|
||||
"your platform. Reconfigure with --disable-rtcd (or send patches)."
|
||||
#endif
|
||||
|
||||
int opus_select_arch(void)
|
||||
{
|
||||
opus_uint32 flags = opus_cpu_capabilities();
|
||||
int arch = 0;
|
||||
|
||||
if(!(flags & OPUS_CPU_ARM_EDSP_FLAG)) {
|
||||
/* Asserts ensure arch values are sequential */
|
||||
celt_assert(arch == OPUS_ARCH_ARM_V4);
|
||||
return arch;
|
||||
}
|
||||
arch++;
|
||||
|
||||
if(!(flags & OPUS_CPU_ARM_MEDIA_FLAG)) {
|
||||
celt_assert(arch == OPUS_ARCH_ARM_EDSP);
|
||||
return arch;
|
||||
}
|
||||
arch++;
|
||||
|
||||
if(!(flags & OPUS_CPU_ARM_NEON_FLAG)) {
|
||||
celt_assert(arch == OPUS_ARCH_ARM_MEDIA);
|
||||
return arch;
|
||||
}
|
||||
arch++;
|
||||
|
||||
celt_assert(arch == OPUS_ARCH_ARM_NEON);
|
||||
return arch;
|
||||
}
|
||||
|
||||
#endif
|
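opus_select_arch() collapses the capability bitmask into a single sequential arch index: each missing tier ends the walk, so the result is the highest consecutive tier the CPU supports. A stand-alone sketch of the same cascade, with illustrative flag values rather than the real OPUS_CPU_ARM_* constants:

#include <stdio.h>

#define FLAG_EDSP  (1<<1)
#define FLAG_MEDIA (1<<2)
#define FLAG_NEON  (1<<3)

static int select_arch(unsigned flags)
{
   int arch = 0;                        /* ARMv4 baseline */
   if (!(flags & FLAG_EDSP))  return arch;
   arch++;                              /* EDSP */
   if (!(flags & FLAG_MEDIA)) return arch;
   arch++;                              /* Media (ARMv6) */
   if (!(flags & FLAG_NEON))  return arch;
   return arch + 1;                     /* NEON */
}

int main(void)
{
   printf("%d\n", select_arch(FLAG_EDSP | FLAG_MEDIA)); /* prints 2 */
   return 0;
}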
77
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/armcpu.h
vendored
Normal file
@@ -0,0 +1,77 @@
/* Copyright (c) 2010 Xiph.Org Foundation
 * Copyright (c) 2013 Parrot */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#if !defined(ARMCPU_H)
# define ARMCPU_H

# if defined(OPUS_ARM_MAY_HAVE_EDSP)
#  define MAY_HAVE_EDSP(name) name ## _edsp
# else
#  define MAY_HAVE_EDSP(name) name ## _c
# endif

# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
#  define MAY_HAVE_MEDIA(name) name ## _media
# else
#  define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)
# endif

# if defined(OPUS_ARM_MAY_HAVE_NEON)
#  define MAY_HAVE_NEON(name) name ## _neon
# else
#  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
# endif

# if defined(OPUS_ARM_PRESUME_EDSP)
#  define PRESUME_EDSP(name) name ## _edsp
# else
#  define PRESUME_EDSP(name) name ## _c
# endif

# if defined(OPUS_ARM_PRESUME_MEDIA)
#  define PRESUME_MEDIA(name) name ## _media
# else
#  define PRESUME_MEDIA(name) PRESUME_EDSP(name)
# endif

# if defined(OPUS_ARM_PRESUME_NEON)
#  define PRESUME_NEON(name) name ## _neon
# else
#  define PRESUME_NEON(name) PRESUME_MEDIA(name)
# endif

# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);

#define OPUS_ARCH_ARM_V4    (0)
#define OPUS_ARCH_ARM_EDSP  (1)
#define OPUS_ARCH_ARM_MEDIA (2)
#define OPUS_ARCH_ARM_NEON  (3)

# endif

#endif
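The MAY_HAVE_* macros paste a suffix onto a function name, so a call site resolves at compile time to the best variant the build is allowed to assume. A small sketch of the intended usage; the do_filter names are placeholders, not symbols from these sources:

#include <stdio.h>

#define OPUS_ARM_MAY_HAVE_NEON  /* pretend configure detected NEON support */

#if defined(OPUS_ARM_MAY_HAVE_NEON)
# define MAY_HAVE_NEON(name) name ## _neon
#else
# define MAY_HAVE_NEON(name) name ## _c
#endif

static void do_filter_c(void)    { puts("portable C path"); }
static void do_filter_neon(void) { puts("NEON path"); }

int main(void)
{
   /* Expands to do_filter_neon() because OPUS_ARM_MAY_HAVE_NEON is defined. */
   MAY_HAVE_NEON(do_filter)();
   (void)do_filter_c; /* keep the unused variant referenced */
   return 0;
}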
37
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/armopts.s.in
vendored
Normal file
@@ -0,0 +1,37 @@
/* Copyright (C) 2013 Mozilla Corporation */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

; Set the following to 1 if we have EDSP instructions
; (LDRD/STRD, etc., ARMv5E and later).
OPUS_ARM_MAY_HAVE_EDSP  * @OPUS_ARM_MAY_HAVE_EDSP@

; Set the following to 1 if we have ARMv6 media instructions.
OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@

; Set the following to 1 if we have NEON (some ARMv7)
OPUS_ARM_MAY_HAVE_NEON  * @OPUS_ARM_MAY_HAVE_NEON@

END
173
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/celt_fft_ne10.c
vendored
Normal file
@@ -0,0 +1,173 @@
/* Copyright (c) 2015 Xiph.Org Foundation
|
||||
Written by Viswanath Puttagunta */
|
||||
/**
|
||||
@file celt_fft_ne10.c
|
||||
@brief ARM Neon optimizations for fft using NE10 library
|
||||
*/
|
||||
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef SKIP_CONFIG_H
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <NE10_dsp.h>
|
||||
#include "os_support.h"
|
||||
#include "kiss_fft.h"
|
||||
#include "stack_alloc.h"
|
||||
|
||||
#if !defined(FIXED_POINT)
|
||||
# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
|
||||
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
|
||||
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
|
||||
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
|
||||
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
|
||||
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
|
||||
#else
|
||||
# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
|
||||
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
|
||||
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
|
||||
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
|
||||
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
|
||||
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
|
||||
#endif
|
||||
|
||||
#if defined(CUSTOM_MODES)
|
||||
|
||||
/* nfft lengths in NE10 that support scaled fft */
|
||||
# define NE10_FFTSCALED_SUPPORT_MAX 4
|
||||
static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
|
||||
480, 240, 120, 60
|
||||
};
|
||||
|
||||
int opus_fft_alloc_arm_neon(kiss_fft_state *st)
|
||||
{
|
||||
int i;
|
||||
size_t memneeded = sizeof(struct arch_fft_state);
|
||||
|
||||
st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);
|
||||
if (!st->arch_fft)
|
||||
return -1;
|
||||
|
||||
for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {
|
||||
if(st->nfft == ne10_fft_scaled_support[i])
|
||||
break;
|
||||
}
|
||||
if (i == NE10_FFTSCALED_SUPPORT_MAX) {
|
||||
/* This nfft length (scaled fft) is not supported in NE10 */
|
||||
st->arch_fft->is_supported = 0;
|
||||
st->arch_fft->priv = NULL;
|
||||
}
|
||||
else {
|
||||
st->arch_fft->is_supported = 1;
|
||||
st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
|
||||
if (st->arch_fft->priv == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void opus_fft_free_arm_neon(kiss_fft_state *st)
|
||||
{
|
||||
NE10_FFT_CFG_TYPE_T cfg;
|
||||
|
||||
if (!st->arch_fft)
|
||||
return;
|
||||
|
||||
cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
|
||||
if (cfg)
|
||||
NE10_FFT_DESTROY_C2C_TYPE(cfg);
|
||||
opus_free(st->arch_fft);
|
||||
}
|
||||
#endif
|
||||
|
||||
void opus_fft_neon(const kiss_fft_state *st,
|
||||
const kiss_fft_cpx *fin,
|
||||
kiss_fft_cpx *fout)
|
||||
{
|
||||
NE10_FFT_STATE_TYPE_T state;
|
||||
NE10_FFT_CFG_TYPE_T cfg = &state;
|
||||
VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
|
||||
SAVE_STACK;
|
||||
ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
|
||||
|
||||
if (!st->arch_fft->is_supported) {
|
||||
/* This nfft length (scaled fft) not supported in NE10 */
|
||||
opus_fft_c(st, fin, fout);
|
||||
}
|
||||
else {
|
||||
memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
|
||||
state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
|
||||
#if !defined(FIXED_POINT)
|
||||
state.is_forward_scaled = 1;
|
||||
|
||||
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
|
||||
(NE10_FFT_CPX_TYPE_T *)fin,
|
||||
cfg, 0);
|
||||
#else
|
||||
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
|
||||
(NE10_FFT_CPX_TYPE_T *)fin,
|
||||
cfg, 0, 1);
|
||||
#endif
|
||||
}
|
||||
RESTORE_STACK;
|
||||
}
|
||||
|
||||
void opus_ifft_neon(const kiss_fft_state *st,
|
||||
const kiss_fft_cpx *fin,
|
||||
kiss_fft_cpx *fout)
|
||||
{
|
||||
NE10_FFT_STATE_TYPE_T state;
|
||||
NE10_FFT_CFG_TYPE_T cfg = &state;
|
||||
VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
|
||||
SAVE_STACK;
|
||||
ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
|
||||
|
||||
if (!st->arch_fft->is_supported) {
|
||||
/* This nfft length (scaled fft) not supported in NE10 */
|
||||
opus_ifft_c(st, fin, fout);
|
||||
}
|
||||
else {
|
||||
memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
|
||||
state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
|
||||
#if !defined(FIXED_POINT)
|
||||
state.is_backward_scaled = 0;
|
||||
|
||||
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
|
||||
(NE10_FFT_CPX_TYPE_T *)fin,
|
||||
cfg, 1);
|
||||
#else
|
||||
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
|
||||
(NE10_FFT_CPX_TYPE_T *)fin,
|
||||
cfg, 1, 0);
|
||||
#endif
|
||||
}
|
||||
RESTORE_STACK;
|
||||
}
|
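NE10 only provides scaled complex FFTs for a handful of sizes, so opus_fft_alloc_arm_neon() records at alloc time whether the current nfft is accelerated, and the call-time wrappers fall back to opus_fft_c()/opus_ifft_c() otherwise. A minimal sketch of that size check, with an assumed helper name:

#include <stddef.h>
#include <stdio.h>

/* Scaled-FFT sizes supported by NE10, as listed above. */
static const int supported_nfft[] = { 480, 240, 120, 60 };

/* Returns 1 when NE10 accelerates this size, 0 otherwise; the caller
 * stores the answer in arch_fft->is_supported and routes unsupported
 * sizes to the portable kiss_fft path at call time. */
static int ne10_supports(int nfft)
{
   for (size_t i = 0; i < sizeof(supported_nfft)/sizeof(supported_nfft[0]); i++)
      if (nfft == supported_nfft[i]) return 1;
   return 0;
}

int main(void)
{
   printf("nfft=480 supported: %d\n", ne10_supports(480)); /* 1 */
   printf("nfft=512 supported: %d\n", ne10_supports(512)); /* 0 */
   return 0;
}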
258
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/celt_mdct_ne10.c
vendored
Normal file
@@ -0,0 +1,258 @@
/* Copyright (c) 2015 Xiph.Org Foundation
|
||||
Written by Viswanath Puttagunta */
|
||||
/**
|
||||
@file celt_mdct_ne10.c
|
||||
@brief ARM Neon optimizations for mdct using NE10 library
|
||||
*/
|
||||
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef SKIP_CONFIG_H
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "kiss_fft.h"
|
||||
#include "_kiss_fft_guts.h"
|
||||
#include "mdct.h"
|
||||
#include "stack_alloc.h"
|
||||
|
||||
void clt_mdct_forward_neon(const mdct_lookup *l,
|
||||
kiss_fft_scalar *in,
|
||||
kiss_fft_scalar * OPUS_RESTRICT out,
|
||||
const opus_val16 *window,
|
||||
int overlap, int shift, int stride, int arch)
|
||||
{
|
||||
int i;
|
||||
int N, N2, N4;
|
||||
VARDECL(kiss_fft_scalar, f);
|
||||
VARDECL(kiss_fft_cpx, f2);
|
||||
const kiss_fft_state *st = l->kfft[shift];
|
||||
const kiss_twiddle_scalar *trig;
|
||||
|
||||
SAVE_STACK;
|
||||
|
||||
N = l->n;
|
||||
trig = l->trig;
|
||||
for (i=0;i<shift;i++)
|
||||
{
|
||||
N >>= 1;
|
||||
trig += N;
|
||||
}
|
||||
N2 = N>>1;
|
||||
N4 = N>>2;
|
||||
|
||||
ALLOC(f, N2, kiss_fft_scalar);
|
||||
ALLOC(f2, N4, kiss_fft_cpx);
|
||||
|
||||
/* Consider the input to be composed of four blocks: [a, b, c, d] */
|
||||
/* Window, shuffle, fold */
|
||||
{
|
||||
/* Temp pointers to make it really clear to the compiler what we're doing */
|
||||
const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
|
||||
const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
|
||||
kiss_fft_scalar * OPUS_RESTRICT yp = f;
|
||||
const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
|
||||
const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
|
||||
for(i=0;i<((overlap+3)>>2);i++)
|
||||
{
|
||||
/* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
|
||||
*yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
|
||||
*yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]);
|
||||
xp1+=2;
|
||||
xp2-=2;
|
||||
wp1+=2;
|
||||
wp2-=2;
|
||||
}
|
||||
wp1 = window;
|
||||
wp2 = window+overlap-1;
|
||||
for(;i<N4-((overlap+3)>>2);i++)
|
||||
{
|
||||
/* Real part arranged as a-bR, Imag part arranged as -c-dR */
|
||||
*yp++ = *xp2;
|
||||
*yp++ = *xp1;
|
||||
xp1+=2;
|
||||
xp2-=2;
|
||||
}
|
||||
for(;i<N4;i++)
|
||||
{
|
||||
/* Real part arranged as a-bR, Imag part arranged as -c-dR */
|
||||
*yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
|
||||
*yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]);
|
||||
xp1+=2;
|
||||
xp2-=2;
|
||||
wp1+=2;
|
||||
wp2-=2;
|
||||
}
|
||||
}
|
||||
/* Pre-rotation */
|
||||
{
|
||||
kiss_fft_scalar * OPUS_RESTRICT yp = f;
|
||||
const kiss_twiddle_scalar *t = &trig[0];
|
||||
for(i=0;i<N4;i++)
|
||||
{
|
||||
kiss_fft_cpx yc;
|
||||
kiss_twiddle_scalar t0, t1;
|
||||
kiss_fft_scalar re, im, yr, yi;
|
||||
t0 = t[i];
|
||||
t1 = t[N4+i];
|
||||
re = *yp++;
|
||||
im = *yp++;
|
||||
yr = S_MUL(re,t0) - S_MUL(im,t1);
|
||||
yi = S_MUL(im,t0) + S_MUL(re,t1);
|
||||
yc.r = yr;
|
||||
yc.i = yi;
|
||||
f2[i] = yc;
|
||||
}
|
||||
}
|
||||
|
||||
opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
|
||||
|
||||
/* Post-rotate */
|
||||
{
|
||||
/* Temp pointers to make it really clear to the compiler what we're doing */
|
||||
const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
|
||||
kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
|
||||
kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
|
||||
const kiss_twiddle_scalar *t = &trig[0];
|
||||
for(i=0;i<N4;i++)
|
||||
{
|
||||
kiss_fft_scalar yr, yi;
|
||||
yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
|
||||
yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
|
||||
*yp1 = yr;
|
||||
*yp2 = yi;
|
||||
fp++;
|
||||
yp1 += 2*stride;
|
||||
yp2 -= 2*stride;
|
||||
}
|
||||
}
|
||||
RESTORE_STACK;
|
||||
}
|
||||
|
||||
void clt_mdct_backward_neon(const mdct_lookup *l,
|
||||
kiss_fft_scalar *in,
|
||||
kiss_fft_scalar * OPUS_RESTRICT out,
|
||||
const opus_val16 * OPUS_RESTRICT window,
|
||||
int overlap, int shift, int stride, int arch)
|
||||
{
|
||||
int i;
|
||||
int N, N2, N4;
|
||||
VARDECL(kiss_fft_scalar, f);
|
||||
const kiss_twiddle_scalar *trig;
|
||||
const kiss_fft_state *st = l->kfft[shift];
|
||||
|
||||
N = l->n;
|
||||
trig = l->trig;
|
||||
for (i=0;i<shift;i++)
|
||||
{
|
||||
N >>= 1;
|
||||
trig += N;
|
||||
}
|
||||
N2 = N>>1;
|
||||
N4 = N>>2;
|
||||
|
||||
ALLOC(f, N2, kiss_fft_scalar);
|
||||
|
||||
/* Pre-rotate */
|
||||
{
|
||||
/* Temp pointers to make it really clear to the compiler what we're doing */
|
||||
const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
|
||||
const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
|
||||
kiss_fft_scalar * OPUS_RESTRICT yp = f;
|
||||
const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
|
||||
for(i=0;i<N4;i++)
|
||||
{
|
||||
kiss_fft_scalar yr, yi;
|
||||
yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
|
||||
yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
|
||||
yp[2*i] = yr;
|
||||
yp[2*i+1] = yi;
|
||||
xp1+=2*stride;
|
||||
xp2-=2*stride;
|
||||
}
|
||||
}
|
||||
|
||||
opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
|
||||
|
||||
/* Post-rotate and de-shuffle from both ends of the buffer at once to make
|
||||
it in-place. */
|
||||
{
|
||||
kiss_fft_scalar * yp0 = out+(overlap>>1);
|
||||
kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
|
||||
const kiss_twiddle_scalar *t = &trig[0];
|
||||
/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
|
||||
middle pair will be computed twice. */
|
||||
for(i=0;i<(N4+1)>>1;i++)
|
||||
{
|
||||
kiss_fft_scalar re, im, yr, yi;
|
||||
kiss_twiddle_scalar t0, t1;
|
||||
re = yp0[0];
|
||||
im = yp0[1];
|
||||
t0 = t[i];
|
||||
t1 = t[N4+i];
|
||||
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
|
||||
yr = S_MUL(re,t0) + S_MUL(im,t1);
|
||||
yi = S_MUL(re,t1) - S_MUL(im,t0);
|
||||
re = yp1[0];
|
||||
im = yp1[1];
|
||||
yp0[0] = yr;
|
||||
yp1[1] = yi;
|
||||
|
||||
t0 = t[(N4-i-1)];
|
||||
t1 = t[(N2-i-1)];
|
||||
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
|
||||
yr = S_MUL(re,t0) + S_MUL(im,t1);
|
||||
yi = S_MUL(re,t1) - S_MUL(im,t0);
|
||||
yp1[0] = yr;
|
||||
yp0[1] = yi;
|
||||
yp0 += 2;
|
||||
yp1 -= 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Mirror on both sides for TDAC */
|
||||
{
|
||||
kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
|
||||
kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
|
||||
const opus_val16 * OPUS_RESTRICT wp1 = window;
|
||||
const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
|
||||
|
||||
for(i = 0; i < overlap/2; i++)
|
||||
{
|
||||
kiss_fft_scalar x1, x2;
|
||||
x1 = *xp1;
|
||||
x2 = *yp1;
|
||||
*yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
|
||||
*xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
|
||||
wp1++;
|
||||
wp2--;
|
||||
}
|
||||
}
|
||||
RESTORE_STACK;
|
||||
}
|
211
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/celt_neon_intr.c
vendored
Normal file
@@ -0,0 +1,211 @@
/* Copyright (c) 2014-2015 Xiph.Org Foundation
|
||||
Written by Viswanath Puttagunta */
|
||||
/**
|
||||
@file celt_neon_intr.c
|
||||
@brief ARM Neon Intrinsic optimizations for celt
|
||||
*/
|
||||
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "../pitch.h"
|
||||
|
||||
#if defined(FIXED_POINT)
|
||||
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
|
||||
{
|
||||
int j;
|
||||
int32x4_t a = vld1q_s32(sum);
|
||||
/* Load y[0...3] */
|
||||
/* This requires len>0 to always be valid (which we assert in the C code). */
|
||||
int16x4_t y0 = vld1_s16(y);
|
||||
y += 4;
|
||||
|
||||
for (j = 0; j + 8 <= len; j += 8)
|
||||
{
|
||||
/* Load x[0...7] */
|
||||
int16x8_t xx = vld1q_s16(x);
|
||||
int16x4_t x0 = vget_low_s16(xx);
|
||||
int16x4_t x4 = vget_high_s16(xx);
|
||||
/* Load y[4...11] */
|
||||
int16x8_t yy = vld1q_s16(y);
|
||||
int16x4_t y4 = vget_low_s16(yy);
|
||||
int16x4_t y8 = vget_high_s16(yy);
|
||||
int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
|
||||
int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);
|
||||
|
||||
int16x4_t y1 = vext_s16(y0, y4, 1);
|
||||
int16x4_t y5 = vext_s16(y4, y8, 1);
|
||||
int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
|
||||
int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);
|
||||
|
||||
int16x4_t y2 = vext_s16(y0, y4, 2);
|
||||
int16x4_t y6 = vext_s16(y4, y8, 2);
|
||||
int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
|
||||
int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);
|
||||
|
||||
int16x4_t y3 = vext_s16(y0, y4, 3);
|
||||
int16x4_t y7 = vext_s16(y4, y8, 3);
|
||||
int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
|
||||
int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);
|
||||
|
||||
y0 = y8;
|
||||
a = a7;
|
||||
x += 8;
|
||||
y += 8;
|
||||
}
|
||||
|
||||
for (; j < len; j++)
|
||||
{
|
||||
int16x4_t x0 = vld1_dup_s16(x); /* load next x */
|
||||
int32x4_t a0 = vmlal_s16(a, y0, x0);
|
||||
|
||||
int16x4_t y4 = vld1_dup_s16(y); /* load next y */
|
||||
y0 = vext_s16(y0, y4, 1);
|
||||
a = a0;
|
||||
x++;
|
||||
y++;
|
||||
}
|
||||
|
||||
vst1q_s32(sum, a);
|
||||
}
|
||||
|
||||
#else
|
||||
/*
|
||||
* Function: xcorr_kernel_neon_float
|
||||
* ---------------------------------
|
||||
* Computes 4 correlation values and stores them in sum[4]
|
||||
*/
|
||||
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
|
||||
float32_t sum[4], int len) {
|
||||
float32x4_t YY[3];
|
||||
float32x4_t YEXT[3];
|
||||
float32x4_t XX[2];
|
||||
float32x2_t XX_2;
|
||||
float32x4_t SUMM;
|
||||
const float32_t *xi = x;
|
||||
const float32_t *yi = y;
|
||||
|
||||
celt_assert(len>0);
|
||||
|
||||
YY[0] = vld1q_f32(yi);
|
||||
SUMM = vdupq_n_f32(0);
|
||||
|
||||
/* Consume 8 elements from the x vector and 12 from the y vector.
 * The 12th y element is never actually touched in this loop, so when
 * len == 8 we only access y[0] through y[10]; y[11] must never be read.
 * Hence the condition is len > 8, not len >= 8. */
|
||||
while (len > 8) {
|
||||
yi += 4;
|
||||
YY[1] = vld1q_f32(yi);
|
||||
yi += 4;
|
||||
YY[2] = vld1q_f32(yi);
|
||||
|
||||
XX[0] = vld1q_f32(xi);
|
||||
xi += 4;
|
||||
XX[1] = vld1q_f32(xi);
|
||||
xi += 4;
|
||||
|
||||
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
|
||||
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
|
||||
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
|
||||
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
|
||||
|
||||
SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
|
||||
YEXT[0] = vextq_f32(YY[1], YY[2], 1);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
|
||||
YEXT[1] = vextq_f32(YY[1], YY[2], 2);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
|
||||
YEXT[2] = vextq_f32(YY[1], YY[2], 3);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
|
||||
|
||||
YY[0] = YY[2];
|
||||
len -= 8;
|
||||
}
|
||||
|
||||
/* Consume 4 elements from the x vector and 8 from the y vector.
 * The 8th y element is never actually touched in this loop, so when
 * len == 4 we only access y[0] through y[6]; y[7] must never be read.
 * Hence the condition is len > 4, not len >= 4. */
|
||||
if (len > 4) {
|
||||
yi += 4;
|
||||
YY[1] = vld1q_f32(yi);
|
||||
|
||||
XX[0] = vld1q_f32(xi);
|
||||
xi += 4;
|
||||
|
||||
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
|
||||
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
|
||||
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
|
||||
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
|
||||
|
||||
YY[0] = YY[1];
|
||||
len -= 4;
|
||||
}
|
||||
|
||||
while (--len > 0) {
|
||||
XX_2 = vld1_dup_f32(xi++);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
|
||||
YY[0]= vld1q_f32(++yi);
|
||||
}
|
||||
|
||||
XX_2 = vld1_dup_f32(xi);
|
||||
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
|
||||
|
||||
vst1q_f32(sum, SUMM);
|
||||
}
|
||||
|
||||
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
|
||||
opus_val32 *xcorr, int len, int max_pitch, int arch) {
|
||||
int i;
|
||||
(void)arch;
|
||||
celt_assert(max_pitch > 0);
|
||||
celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
|
||||
|
||||
for (i = 0; i < (max_pitch-3); i += 4) {
|
||||
xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
|
||||
(float32_t *)xcorr+i, len);
|
||||
}
|
||||
|
||||
/* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
|
||||
for (; i < max_pitch; i++) {
|
||||
xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
|
||||
}
|
||||
}
|
||||
#endif
|
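Both the fixed-point and float NEON kernels above compute the same quantity: sum[k] += x[j]*y[j+k] for k = 0..3 accumulated over the whole input. A plain-C reference sketch (illustrative only; the real kernels additionally require len > 0, which the callers assert):

#include <stdio.h>

/* Scalar reference of the four-lag correlation kernel. */
static void xcorr_kernel_ref(const float *x, const float *y,
                             float sum[4], int len)
{
   for (int j = 0; j < len; j++)
      for (int k = 0; k < 4; k++)
         sum[k] += x[j] * y[j + k];
}

int main(void)
{
   float x[4] = {1, 2, 3, 4};
   float y[7] = {1, 1, 1, 1, 1, 1, 1};
   float sum[4] = {0, 0, 0, 0};
   xcorr_kernel_ref(x, y, sum, 4);
   printf("%g %g %g %g\n", sum[0], sum[1], sum[2], sum[3]); /* 10 10 10 10 */
   return 0;
}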
555
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
vendored
Normal file
@@ -0,0 +1,555 @@
.syntax unified
|
||||
@ Copyright (c) 2007-2008 CSIRO
|
||||
@ Copyright (c) 2007-2009 Xiph.Org Foundation
|
||||
@ Copyright (c) 2013 Parrot
|
||||
@ Written by Aurélien Zanelli
|
||||
@
|
||||
@ Redistribution and use in source and binary forms, with or without
|
||||
@ modification, are permitted provided that the following conditions
|
||||
@ are met:
|
||||
@
|
||||
@ - Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@
|
||||
@ - Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.text; .p2align 2; .arch armv7-a
|
||||
.fpu neon
|
||||
.object_arch armv4t
|
||||
|
||||
.include "celt/arm/armopts-gnu.S"
|
||||
|
||||
.if OPUS_ARM_MAY_HAVE_EDSP
|
||||
.global celt_pitch_xcorr_edsp
|
||||
.endif
|
||||
|
||||
.if OPUS_ARM_MAY_HAVE_NEON
|
||||
.global celt_pitch_xcorr_neon
|
||||
.endif
|
||||
|
||||
.if OPUS_ARM_MAY_HAVE_NEON
|
||||
|
||||
@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
|
||||
.type xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
|
||||
xcorr_kernel_neon_start:
|
||||
@ input:
|
||||
@ r3 = int len
|
||||
@ r4 = opus_val16 *x
|
||||
@ r5 = opus_val16 *y
|
||||
@ q0 = opus_val32 sum[4]
|
||||
@ output:
|
||||
@ q0 = opus_val32 sum[4]
|
||||
@ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
|
||||
@ internal usage:
|
||||
@ r12 = int j
|
||||
@ d3 = y_3|y_2|y_1|y_0
|
||||
@ q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
|
||||
@ q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
|
||||
@ q8 = scratch
|
||||
@
|
||||
@ Load y[0...3]
|
||||
@ This requires len>0 to always be valid (which we assert in the C code).
|
||||
VLD1.16 {d5}, [r5]!
|
||||
SUBS r12, r3, #8
|
||||
BLE xcorr_kernel_neon_process4
|
||||
@ Process 8 samples at a time.
|
||||
@ This loop loads one y value more than we actually need. Therefore we have to
|
||||
@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
|
||||
@ reading past the end of the array.
|
||||
xcorr_kernel_neon_process8:
|
||||
@ This loop has 19 total instructions (10 cycles to issue, minimum), with
|
||||
@ - 2 cycles of ARM instructions,
|
||||
@ - 10 cycles of load/store/byte permute instructions, and
|
||||
@ - 9 cycles of data processing instructions.
|
||||
@ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
|
||||
@ latter two categories, meaning the whole loop should run in 10 cycles per
|
||||
@ iteration, barring cache misses.
|
||||
@
|
||||
@ Load x[0...7]
|
||||
VLD1.16 {d6, d7}, [r4]!
|
||||
@ Unlike VMOV, VAND is a data processing instruction (and doesn't get
|
||||
@ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
|
||||
VAND d3, d5, d5
|
||||
SUBS r12, r12, #8
|
||||
@ Load y[4...11]
|
||||
VLD1.16 {d4, d5}, [r5]!
|
||||
VMLAL.S16 q0, d3, d6[0]
|
||||
VEXT.16 d16, d3, d4, #1
|
||||
VMLAL.S16 q0, d4, d7[0]
|
||||
VEXT.16 d17, d4, d5, #1
|
||||
VMLAL.S16 q0, d16, d6[1]
|
||||
VEXT.16 d16, d3, d4, #2
|
||||
VMLAL.S16 q0, d17, d7[1]
|
||||
VEXT.16 d17, d4, d5, #2
|
||||
VMLAL.S16 q0, d16, d6[2]
|
||||
VEXT.16 d16, d3, d4, #3
|
||||
VMLAL.S16 q0, d17, d7[2]
|
||||
VEXT.16 d17, d4, d5, #3
|
||||
VMLAL.S16 q0, d16, d6[3]
|
||||
VMLAL.S16 q0, d17, d7[3]
|
||||
BGT xcorr_kernel_neon_process8
|
||||
@ Process 4 samples here if we have > 4 left (still reading one extra y value).
|
||||
xcorr_kernel_neon_process4:
|
||||
ADDS r12, r12, #4
|
||||
BLE xcorr_kernel_neon_process2
|
||||
@ Load x[0...3]
|
||||
VLD1.16 d6, [r4]!
|
||||
@ Use VAND since it's a data processing instruction again.
|
||||
VAND d4, d5, d5
|
||||
SUB r12, r12, #4
|
||||
@ Load y[4...7]
|
||||
VLD1.16 d5, [r5]!
|
||||
VMLAL.S16 q0, d4, d6[0]
|
||||
VEXT.16 d16, d4, d5, #1
|
||||
VMLAL.S16 q0, d16, d6[1]
|
||||
VEXT.16 d16, d4, d5, #2
|
||||
VMLAL.S16 q0, d16, d6[2]
|
||||
VEXT.16 d16, d4, d5, #3
|
||||
VMLAL.S16 q0, d16, d6[3]
|
||||
@ Process 2 samples here if we have > 2 left (still reading one extra y value).
|
||||
xcorr_kernel_neon_process2:
|
||||
ADDS r12, r12, #2
|
||||
BLE xcorr_kernel_neon_process1
|
||||
@ Load x[0...1]
|
||||
VLD2.16 {d6[],d7[]}, [r4]!
|
||||
@ Use VAND since it's a data processing instruction again.
|
||||
VAND d4, d5, d5
|
||||
SUB r12, r12, #2
|
||||
@ Load y[4...5]
|
||||
VLD1.32 {d5[]}, [r5]!
|
||||
VMLAL.S16 q0, d4, d6
|
||||
VEXT.16 d16, d4, d5, #1
|
||||
@ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
|
||||
@ instead of VEXT, since it's a data-processing instruction.
|
||||
VSRI.64 d5, d4, #32
|
||||
VMLAL.S16 q0, d16, d7
|
||||
@ Process 1 sample using the extra y value we loaded above.
|
||||
xcorr_kernel_neon_process1:
|
||||
@ Load next *x
|
||||
VLD1.16 {d6[]}, [r4]!
|
||||
ADDS r12, r12, #1
|
||||
@ y[0...3] are left in d5 from prior iteration(s) (if any)
|
||||
VMLAL.S16 q0, d5, d6
|
||||
MOVLE pc, lr
|
||||
@ Now process 1 last sample, not reading ahead.
|
||||
@ Load last *y
|
||||
VLD1.16 {d4[]}, [r5]!
|
||||
VSRI.64 d4, d5, #16
|
||||
@ Load last *x
|
||||
VLD1.16 {d6[]}, [r4]!
|
||||
VMLAL.S16 q0, d4, d6
|
||||
MOV pc, lr
|
||||
.size xcorr_kernel_neon, .-xcorr_kernel_neon @ ENDP
|
||||
|
||||
@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
|
||||
@ opus_val32 *xcorr, int len, int max_pitch, int arch)
|
||||
.type celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
|
||||
@ input:
|
||||
@ r0 = opus_val16 *_x
|
||||
@ r1 = opus_val16 *_y
|
||||
@ r2 = opus_val32 *xcorr
|
||||
@ r3 = int len
|
||||
@ output:
|
||||
@ r0 = int maxcorr
|
||||
@ internal usage:
|
||||
@ r4 = opus_val16 *x (for xcorr_kernel_neon())
|
||||
@ r5 = opus_val16 *y (for xcorr_kernel_neon())
|
||||
@ r6 = int max_pitch
|
||||
@ r12 = int j
|
||||
@ q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
|
||||
@ ignored:
|
||||
@ int arch
|
||||
STMFD sp!, {r4-r6, lr}
|
||||
LDR r6, [sp, #16]
|
||||
VMOV.S32 q15, #1
|
||||
@ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
|
||||
SUBS r6, r6, #4
|
||||
BLT celt_pitch_xcorr_neon_process4_done
|
||||
celt_pitch_xcorr_neon_process4:
|
||||
@ xcorr_kernel_neon parameters:
|
||||
@ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
|
||||
MOV r4, r0
|
||||
MOV r5, r1
|
||||
VEOR q0, q0, q0
|
||||
@ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
|
||||
@ So we don't save/restore any other registers.
|
||||
BL xcorr_kernel_neon_start
|
||||
SUBS r6, r6, #4
|
||||
VST1.32 {q0}, [r2]!
|
||||
@ _y += 4
|
||||
ADD r1, r1, #8
|
||||
VMAX.S32 q15, q15, q0
|
||||
@ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
|
||||
BGE celt_pitch_xcorr_neon_process4
|
||||
@ We have less than 4 sums left to compute.
|
||||
celt_pitch_xcorr_neon_process4_done:
|
||||
ADDS r6, r6, #4
|
||||
@ Reduce maxcorr to a single value
|
||||
VMAX.S32 d30, d30, d31
|
||||
VPMAX.S32 d30, d30, d30
|
||||
@ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
|
||||
BLE celt_pitch_xcorr_neon_done
|
||||
@ Now compute each remaining sum one at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining:
|
||||
MOV r4, r0
|
||||
MOV r5, r1
|
||||
VMOV.I32 q0, #0
|
||||
SUBS r12, r3, #8
|
||||
BLT celt_pitch_xcorr_neon_process_remaining4
|
||||
@ Sum terms 8 at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining_loop8:
|
||||
@ Load x[0...7]
|
||||
VLD1.16 {q1}, [r4]!
|
||||
@ Load y[0...7]
|
||||
VLD1.16 {q2}, [r5]!
|
||||
SUBS r12, r12, #8
|
||||
VMLAL.S16 q0, d4, d2
|
||||
VMLAL.S16 q0, d5, d3
|
||||
BGE celt_pitch_xcorr_neon_process_remaining_loop8
|
||||
@ Sum terms 4 at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining4:
|
||||
ADDS r12, r12, #4
|
||||
BLT celt_pitch_xcorr_neon_process_remaining4_done
|
||||
@ Load x[0...3]
|
||||
VLD1.16 {d2}, [r4]!
|
||||
@ Load y[0...3]
|
||||
VLD1.16 {d3}, [r5]!
|
||||
SUB r12, r12, #4
|
||||
VMLAL.S16 q0, d3, d2
|
||||
celt_pitch_xcorr_neon_process_remaining4_done:
|
||||
@ Reduce the sum to a single value.
|
||||
VADD.S32 d0, d0, d1
|
||||
VPADDL.S32 d0, d0
|
||||
ADDS r12, r12, #4
|
||||
BLE celt_pitch_xcorr_neon_process_remaining_loop_done
|
||||
@ Sum terms 1 at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining_loop1:
|
||||
VLD1.16 {d2[]}, [r4]!
|
||||
VLD1.16 {d3[]}, [r5]!
|
||||
SUBS r12, r12, #1
|
||||
VMLAL.S16 q0, d2, d3
|
||||
BGT celt_pitch_xcorr_neon_process_remaining_loop1
|
||||
celt_pitch_xcorr_neon_process_remaining_loop_done:
|
||||
VST1.32 {d0[0]}, [r2]!
|
||||
VMAX.S32 d30, d30, d0
|
||||
SUBS r6, r6, #1
|
||||
@ _y++
|
||||
ADD r1, r1, #2
|
||||
@ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
|
||||
BGT celt_pitch_xcorr_neon_process_remaining
|
||||
celt_pitch_xcorr_neon_done:
|
||||
VMOV.32 r0, d30[0]
|
||||
LDMFD sp!, {r4-r6, pc}
|
||||
.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon @ ENDP
|
||||
|
||||
.endif
|
||||
|
||||
.if OPUS_ARM_MAY_HAVE_EDSP
|
||||
|
||||
@ This will get used on ARMv7 devices without NEON, so it has been optimized
|
||||
@ to take advantage of dual-issuing where possible.
|
||||
.type xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
|
||||
xcorr_kernel_edsp_start:
|
||||
@ input:
|
||||
@ r3 = int len
|
||||
@ r4 = opus_val16 *_x (must be 32-bit aligned)
|
||||
@ r5 = opus_val16 *_y (must be 32-bit aligned)
|
||||
@ r6...r9 = opus_val32 sum[4]
|
||||
@ output:
|
||||
@ r6...r9 = opus_val32 sum[4]
|
||||
@ preserved: r0-r5
|
||||
@ internal usage
|
||||
@ r2 = int j
|
||||
@ r12,r14 = opus_val16 x[4]
|
||||
@ r10,r11 = opus_val16 y[4]
|
||||
STMFD sp!, {r2,r4,r5,lr}
|
||||
LDR r10, [r5], #4 @ Load y[0...1]
|
||||
SUBS r2, r3, #4 @ j = len-4
|
||||
LDR r11, [r5], #4 @ Load y[2...3]
|
||||
BLE xcorr_kernel_edsp_process4_done
|
||||
LDR r12, [r4], #4 @ Load x[0...1]
|
||||
@ Stall
|
||||
xcorr_kernel_edsp_process4:
|
||||
@ The multiplies must issue from pipeline 0, and can't dual-issue with each
|
||||
@ other. Every other instruction here dual-issues with a multiply, and is
|
||||
@ thus "free". There should be no stalls in the body of the loop.
|
||||
SMLABB r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x_0,y_0)
|
||||
LDR r14, [r4], #4 @ Load x[2...3]
|
||||
SMLABT r7, r12, r10, r7 @ sum[1] = MAC16_16(sum[1],x_0,y_1)
|
||||
SUBS r2, r2, #4 @ j-=4
|
||||
SMLABB r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x_0,y_2)
|
||||
SMLABT r9, r12, r11, r9 @ sum[3] = MAC16_16(sum[3],x_0,y_3)
|
||||
SMLATT r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x_1,y_1)
|
||||
LDR r10, [r5], #4 @ Load y[4...5]
|
||||
SMLATB r7, r12, r11, r7 @ sum[1] = MAC16_16(sum[1],x_1,y_2)
|
||||
SMLATT r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x_1,y_3)
|
||||
SMLATB r9, r12, r10, r9 @ sum[3] = MAC16_16(sum[3],x_1,y_4)
|
||||
LDRGT r12, [r4], #4 @ Load x[0...1]
|
||||
SMLABB r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],x_2,y_2)
|
||||
SMLABT r7, r14, r11, r7 @ sum[1] = MAC16_16(sum[1],x_2,y_3)
|
||||
SMLABB r8, r14, r10, r8 @ sum[2] = MAC16_16(sum[2],x_2,y_4)
|
||||
SMLABT r9, r14, r10, r9 @ sum[3] = MAC16_16(sum[3],x_2,y_5)
|
||||
SMLATT r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],x_3,y_3)
|
||||
LDR r11, [r5], #4 @ Load y[6...7]
|
||||
SMLATB r7, r14, r10, r7 @ sum[1] = MAC16_16(sum[1],x_3,y_4)
|
||||
SMLATT r8, r14, r10, r8 @ sum[2] = MAC16_16(sum[2],x_3,y_5)
|
||||
SMLATB r9, r14, r11, r9 @ sum[3] = MAC16_16(sum[3],x_3,y_6)
|
||||
BGT xcorr_kernel_edsp_process4
|
||||
xcorr_kernel_edsp_process4_done:
|
||||
ADDS r2, r2, #4
|
||||
BLE xcorr_kernel_edsp_done
|
||||
LDRH r12, [r4], #2 @ r12 = *x++
|
||||
SUBS r2, r2, #1 @ j--
|
||||
@ Stall
|
||||
SMLABB r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x,y_0)
|
||||
LDRHGT r14, [r4], #2 @ r14 = *x++
|
||||
SMLABT r7, r12, r10, r7 @ sum[1] = MAC16_16(sum[1],x,y_1)
|
||||
SMLABB r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x,y_2)
|
||||
SMLABT r9, r12, r11, r9 @ sum[3] = MAC16_16(sum[3],x,y_3)
|
||||
BLE xcorr_kernel_edsp_done
|
||||
SMLABT r6, r14, r10, r6 @ sum[0] = MAC16_16(sum[0],x,y_1)
|
||||
SUBS r2, r2, #1 @ j--
|
||||
SMLABB r7, r14, r11, r7 @ sum[1] = MAC16_16(sum[1],x,y_2)
|
||||
LDRH r10, [r5], #2 @ r10 = y_4 = *y++
|
||||
SMLABT r8, r14, r11, r8 @ sum[2] = MAC16_16(sum[2],x,y_3)
|
||||
LDRHGT r12, [r4], #2 @ r12 = *x++
|
||||
SMLABB r9, r14, r10, r9 @ sum[3] = MAC16_16(sum[3],x,y_4)
|
||||
BLE xcorr_kernel_edsp_done
|
||||
SMLABB r6, r12, r11, r6 @ sum[0] = MAC16_16(sum[0],tmp,y_2)
|
||||
CMP r2, #1 @ j--
|
||||
SMLABT r7, r12, r11, r7 @ sum[1] = MAC16_16(sum[1],tmp,y_3)
|
||||
LDRH r2, [r5], #2 @ r2 = y_5 = *y++
|
||||
SMLABB r8, r12, r10, r8 @ sum[2] = MAC16_16(sum[2],tmp,y_4)
|
||||
LDRHGT r14, [r4] @ r14 = *x
|
||||
SMLABB r9, r12, r2, r9 @ sum[3] = MAC16_16(sum[3],tmp,y_5)
|
||||
BLE xcorr_kernel_edsp_done
|
||||
SMLABT r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],tmp,y_3)
|
||||
LDRH r11, [r5] @ r11 = y_6 = *y
|
||||
SMLABB r7, r14, r10, r7 @ sum[1] = MAC16_16(sum[1],tmp,y_4)
|
||||
SMLABB r8, r14, r2, r8 @ sum[2] = MAC16_16(sum[2],tmp,y_5)
|
||||
SMLABB r9, r14, r11, r9 @ sum[3] = MAC16_16(sum[3],tmp,y_6)
|
||||
xcorr_kernel_edsp_done:
|
||||
LDMFD sp!, {r2,r4,r5,pc}
|
||||
.size xcorr_kernel_edsp, .-xcorr_kernel_edsp @ ENDP
|
||||
|
||||
.type celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
|
||||
@ input:
|
||||
@ r0 = opus_val16 *_x (must be 32-bit aligned)
|
||||
@ r1 = opus_val16 *_y (only needs to be 16-bit aligned)
|
||||
@ r2 = opus_val32 *xcorr
|
||||
@ r3 = int len
|
||||
@ output:
|
||||
@ r0 = maxcorr
|
||||
@ internal usage
|
||||
@ r4 = opus_val16 *x
|
||||
@ r5 = opus_val16 *y
|
||||
@ r6 = opus_val32 sum0
|
||||
@ r7 = opus_val32 sum1
|
||||
@ r8 = opus_val32 sum2
|
||||
@ r9 = opus_val32 sum3
|
||||
@ r1 = int max_pitch
|
||||
@ r12 = int j
|
||||
@ ignored:
|
||||
@ int arch
|
||||
STMFD sp!, {r4-r11, lr}
|
||||
MOV r5, r1
|
||||
LDR r1, [sp, #36]
|
||||
MOV r4, r0
|
||||
TST r5, #3
|
||||
@ maxcorr = 1
|
||||
MOV r0, #1
|
||||
BEQ celt_pitch_xcorr_edsp_process1u_done
|
||||
@ Compute one sum at the start to make y 32-bit aligned.
|
||||
SUBS r12, r3, #4
|
||||
@ r14 = sum = 0
|
||||
MOV r14, #0
|
||||
LDRH r8, [r5], #2
|
||||
BLE celt_pitch_xcorr_edsp_process1u_loop4_done
|
||||
LDR r6, [r4], #4
|
||||
MOV r8, r8, LSL #16
|
||||
celt_pitch_xcorr_edsp_process1u_loop4:
|
||||
LDR r9, [r5], #4
|
||||
SMLABT r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0)
|
||||
LDR r7, [r4], #4
|
||||
SMLATB r14, r6, r9, r14 @ sum = MAC16_16(sum, x_1, y_1)
|
||||
LDR r8, [r5], #4
|
||||
SMLABT r14, r7, r9, r14 @ sum = MAC16_16(sum, x_2, y_2)
|
||||
SUBS r12, r12, #4 @ j-=4
|
||||
SMLATB r14, r7, r8, r14 @ sum = MAC16_16(sum, x_3, y_3)
|
||||
LDRGT r6, [r4], #4
|
||||
BGT celt_pitch_xcorr_edsp_process1u_loop4
|
||||
MOV r8, r8, LSR #16
|
||||
celt_pitch_xcorr_edsp_process1u_loop4_done:
|
||||
ADDS r12, r12, #4
|
||||
celt_pitch_xcorr_edsp_process1u_loop1:
|
||||
LDRHGE r6, [r4], #2
|
||||
@ Stall
|
||||
SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, *x, *y)
|
||||
SUBSGE r12, r12, #1
|
||||
LDRHGT r8, [r5], #2
|
||||
BGT celt_pitch_xcorr_edsp_process1u_loop1
|
||||
@ Restore _x
|
||||
SUB r4, r4, r3, LSL #1
|
||||
@ Restore and advance _y
|
||||
SUB r5, r5, r3, LSL #1
|
||||
@ maxcorr = max(maxcorr, sum)
|
||||
CMP r0, r14
|
||||
ADD r5, r5, #2
|
||||
MOVLT r0, r14
|
||||
SUBS r1, r1, #1
|
||||
@ xcorr[i] = sum
|
||||
STR r14, [r2], #4
|
||||
BLE celt_pitch_xcorr_edsp_done
|
||||
celt_pitch_xcorr_edsp_process1u_done:
|
||||
@ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
|
||||
SUBS r1, r1, #4
|
||||
BLT celt_pitch_xcorr_edsp_process2
|
||||
celt_pitch_xcorr_edsp_process4:
|
||||
@ xcorr_kernel_edsp parameters:
|
||||
@ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
|
||||
MOV r6, #0
|
||||
MOV r7, #0
|
||||
MOV r8, #0
|
||||
MOV r9, #0
|
||||
BL xcorr_kernel_edsp_start @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
|
||||
@ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
|
||||
CMP r0, r6
|
||||
@ _y+=4
|
||||
ADD r5, r5, #8
|
||||
MOVLT r0, r6
|
||||
CMP r0, r7
|
||||
MOVLT r0, r7
|
||||
CMP r0, r8
|
||||
MOVLT r0, r8
|
||||
CMP r0, r9
|
||||
MOVLT r0, r9
|
||||
STMIA r2!, {r6-r9}
|
||||
SUBS r1, r1, #4
|
||||
BGE celt_pitch_xcorr_edsp_process4
|
||||
celt_pitch_xcorr_edsp_process2:
|
||||
ADDS r1, r1, #2
|
||||
BLT celt_pitch_xcorr_edsp_process1a
|
||||
SUBS r12, r3, #4
|
||||
@ {r10, r11} = {sum0, sum1} = {0, 0}
|
||||
MOV r10, #0
|
||||
MOV r11, #0
|
||||
LDR r8, [r5], #4
|
||||
BLE celt_pitch_xcorr_edsp_process2_loop_done
|
||||
LDR r6, [r4], #4
|
||||
LDR r9, [r5], #4
|
||||
celt_pitch_xcorr_edsp_process2_loop4:
|
||||
SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0)
|
||||
LDR r7, [r4], #4
|
||||
SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1)
|
||||
SUBS r12, r12, #4 @ j-=4
|
||||
SMLATT r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_1, y_1)
|
||||
LDR r8, [r5], #4
|
||||
SMLATB r11, r6, r9, r11 @ sum1 = MAC16_16(sum1, x_1, y_2)
|
||||
LDRGT r6, [r4], #4
|
||||
SMLABB r10, r7, r9, r10 @ sum0 = MAC16_16(sum0, x_2, y_2)
|
||||
SMLABT r11, r7, r9, r11 @ sum1 = MAC16_16(sum1, x_2, y_3)
|
||||
SMLATT r10, r7, r9, r10 @ sum0 = MAC16_16(sum0, x_3, y_3)
|
||||
LDRGT r9, [r5], #4
|
||||
SMLATB r11, r7, r8, r11 @ sum1 = MAC16_16(sum1, x_3, y_4)
|
||||
BGT celt_pitch_xcorr_edsp_process2_loop4
|
||||
celt_pitch_xcorr_edsp_process2_loop_done:
|
||||
ADDS r12, r12, #2
|
||||
BLE celt_pitch_xcorr_edsp_process2_1
|
||||
LDR r6, [r4], #4
|
||||
@ Stall
|
||||
SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0)
|
||||
LDR r9, [r5], #4
|
||||
SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1)
|
||||
SUB r12, r12, #2
|
||||
SMLATT r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_1, y_1)
|
||||
MOV r8, r9
|
||||
SMLATB r11, r6, r9, r11 @ sum1 = MAC16_16(sum1, x_1, y_2)
|
||||
celt_pitch_xcorr_edsp_process2_1:
|
||||
LDRH r6, [r4], #2
|
||||
ADDS r12, r12, #1
|
||||
@ Stall
|
||||
SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0)
|
||||
LDRHGT r7, [r4], #2
|
||||
SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1)
|
||||
BLE celt_pitch_xcorr_edsp_process2_done
|
||||
LDRH r9, [r5], #2
|
||||
SMLABT r10, r7, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_1)
|
||||
SMLABB r11, r7, r9, r11 @ sum1 = MAC16_16(sum1, x_0, y_2)
|
||||
celt_pitch_xcorr_edsp_process2_done:
|
||||
@ Restore _x
|
||||
SUB r4, r4, r3, LSL #1
|
||||
@ Restore and advance _y
|
||||
SUB r5, r5, r3, LSL #1
|
||||
@ maxcorr = max(maxcorr, sum0)
|
||||
CMP r0, r10
|
||||
ADD r5, r5, #2
|
||||
MOVLT r0, r10
|
||||
SUB r1, r1, #2
|
||||
@ maxcorr = max(maxcorr, sum1)
|
||||
CMP r0, r11
|
||||
@ xcorr[i] = sum
|
||||
STR r10, [r2], #4
|
||||
MOVLT r0, r11
|
||||
STR r11, [r2], #4
|
||||
celt_pitch_xcorr_edsp_process1a:
|
||||
ADDS r1, r1, #1
|
||||
BLT celt_pitch_xcorr_edsp_done
|
||||
SUBS r12, r3, #4
|
||||
@ r14 = sum = 0
|
||||
MOV r14, #0
|
||||
BLT celt_pitch_xcorr_edsp_process1a_loop_done
|
||||
LDR r6, [r4], #4
|
||||
LDR r8, [r5], #4
|
||||
LDR r7, [r4], #4
|
||||
LDR r9, [r5], #4
|
||||
celt_pitch_xcorr_edsp_process1a_loop4:
|
||||
SMLABB r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0)
|
||||
SUBS r12, r12, #4 @ j-=4
|
||||
SMLATT r14, r6, r8, r14 @ sum = MAC16_16(sum, x_1, y_1)
|
||||
LDRGE r6, [r4], #4
|
||||
SMLABB r14, r7, r9, r14 @ sum = MAC16_16(sum, x_2, y_2)
|
||||
LDRGE r8, [r5], #4
|
||||
SMLATT r14, r7, r9, r14 @ sum = MAC16_16(sum, x_3, y_3)
|
||||
LDRGE r7, [r4], #4
|
||||
LDRGE r9, [r5], #4
|
||||
BGE celt_pitch_xcorr_edsp_process1a_loop4
|
||||
celt_pitch_xcorr_edsp_process1a_loop_done:
|
||||
ADDS r12, r12, #2
|
||||
LDRGE r6, [r4], #4
|
||||
LDRGE r8, [r5], #4
|
||||
@ Stall
|
||||
SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0)
|
||||
SUBGE r12, r12, #2
|
||||
SMLATTGE r14, r6, r8, r14 @ sum = MAC16_16(sum, x_1, y_1)
|
||||
ADDS r12, r12, #1
|
||||
LDRHGE r6, [r4], #2
|
||||
LDRHGE r8, [r5], #2
|
||||
@ Stall
|
||||
SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, *x, *y)
|
||||
@ maxcorr = max(maxcorr, sum)
|
||||
CMP r0, r14
|
||||
@ xcorr[i] = sum
|
||||
STR r14, [r2], #4
|
||||
MOVLT r0, r14
|
||||
celt_pitch_xcorr_edsp_done:
|
||||
LDMFD sp!, {r4-r11, pc}
|
||||
.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp @ ENDP
|
||||
|
||||
.endif
|
||||
|
||||
@ END:
|
||||
.section .note.GNU-stack,"",%progbits
|
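For reference, here is a plain-C sketch of what the NEON and EDSP routines above compute (the function name and typedefs are stand-ins for this illustration, not the upstream C code): each xcorr[i] is the inner product of x with y advanced by i samples, and the return value is the largest correlation seen, floored at 1, which is exactly how the assembly seeds maxcorr.

#include <stdint.h>

typedef int16_t opus_val16;   /* assumed 16-bit sample type of the fixed-point build */
typedef int32_t opus_val32;   /* assumed 32-bit accumulator type */

static opus_val32 pitch_xcorr_ref(const opus_val16 *x, const opus_val16 *y,
                                  opus_val32 *xcorr, int len, int max_pitch)
{
    opus_val32 maxcorr = 1;                        /* the VMOV.S32 q15, #1 / MOV r0, #1 above */
    for (int i = 0; i < max_pitch; i++) {
        opus_val32 sum = 0;
        for (int j = 0; j < len; j++)
            sum += (opus_val32)x[j] * y[j + i];    /* MAC16_16(sum, x[j], y[j+i]) */
        xcorr[i] = sum;
        if (sum > maxcorr)
            maxcorr = sum;
    }
    return maxcorr;
}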
551
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/celt_pitch_xcorr_arm.s
vendored
Normal file
|
@@ -0,0 +1,551 @@
|
|||
; Copyright (c) 2007-2008 CSIRO
|
||||
; Copyright (c) 2007-2009 Xiph.Org Foundation
|
||||
; Copyright (c) 2013 Parrot
|
||||
; Written by Aurélien Zanelli
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
;
|
||||
; - Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
;
|
||||
; - Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in the
|
||||
; documentation and/or other materials provided with the distribution.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
GET celt/arm/armopts.s
|
||||
|
||||
IF OPUS_ARM_MAY_HAVE_EDSP
|
||||
EXPORT celt_pitch_xcorr_edsp
|
||||
ENDIF
|
||||
|
||||
IF OPUS_ARM_MAY_HAVE_NEON
|
||||
EXPORT celt_pitch_xcorr_neon
|
||||
ENDIF
|
||||
|
||||
IF OPUS_ARM_MAY_HAVE_NEON
|
||||
|
||||
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
|
||||
xcorr_kernel_neon PROC
|
||||
xcorr_kernel_neon_start
|
||||
; input:
|
||||
; r3 = int len
|
||||
; r4 = opus_val16 *x
|
||||
; r5 = opus_val16 *y
|
||||
; q0 = opus_val32 sum[4]
|
||||
; output:
|
||||
; q0 = opus_val32 sum[4]
|
||||
; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
|
||||
; internal usage:
|
||||
; r12 = int j
|
||||
; d3 = y_3|y_2|y_1|y_0
|
||||
; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
|
||||
; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
|
||||
; q8 = scratch
|
||||
;
|
||||
; Load y[0...3]
|
||||
; This requires len>0 to always be valid (which we assert in the C code).
|
||||
VLD1.16 {d5}, [r5]!
|
||||
SUBS r12, r3, #8
|
||||
BLE xcorr_kernel_neon_process4
|
||||
; Process 8 samples at a time.
|
||||
; This loop loads one y value more than we actually need. Therefore we have to
|
||||
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
|
||||
; reading past the end of the array.
|
||||
xcorr_kernel_neon_process8
|
||||
; This loop has 19 total instructions (10 cycles to issue, minimum), with
|
||||
; - 2 cycles of ARM instructions,
|
||||
; - 10 cycles of load/store/byte permute instructions, and
|
||||
; - 9 cycles of data processing instructions.
|
||||
; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
|
||||
; latter two categories, meaning the whole loop should run in 10 cycles per
|
||||
; iteration, barring cache misses.
|
||||
;
|
||||
; Load x[0...7]
|
||||
VLD1.16 {d6, d7}, [r4]!
|
||||
; Unlike VMOV, VAND is a data processing instruction (and doesn't get
|
||||
; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
|
||||
VAND d3, d5, d5
|
||||
SUBS r12, r12, #8
|
||||
; Load y[4...11]
|
||||
VLD1.16 {d4, d5}, [r5]!
|
||||
VMLAL.S16 q0, d3, d6[0]
|
||||
VEXT.16 d16, d3, d4, #1
|
||||
VMLAL.S16 q0, d4, d7[0]
|
||||
VEXT.16 d17, d4, d5, #1
|
||||
VMLAL.S16 q0, d16, d6[1]
|
||||
VEXT.16 d16, d3, d4, #2
|
||||
VMLAL.S16 q0, d17, d7[1]
|
||||
VEXT.16 d17, d4, d5, #2
|
||||
VMLAL.S16 q0, d16, d6[2]
|
||||
VEXT.16 d16, d3, d4, #3
|
||||
VMLAL.S16 q0, d17, d7[2]
|
||||
VEXT.16 d17, d4, d5, #3
|
||||
VMLAL.S16 q0, d16, d6[3]
|
||||
VMLAL.S16 q0, d17, d7[3]
|
||||
BGT xcorr_kernel_neon_process8
|
||||
; Process 4 samples here if we have > 4 left (still reading one extra y value).
|
||||
xcorr_kernel_neon_process4
|
||||
ADDS r12, r12, #4
|
||||
BLE xcorr_kernel_neon_process2
|
||||
; Load x[0...3]
|
||||
VLD1.16 d6, [r4]!
|
||||
; Use VAND since it's a data processing instruction again.
|
||||
VAND d4, d5, d5
|
||||
SUB r12, r12, #4
|
||||
; Load y[4...7]
|
||||
VLD1.16 d5, [r5]!
|
||||
VMLAL.S16 q0, d4, d6[0]
|
||||
VEXT.16 d16, d4, d5, #1
|
||||
VMLAL.S16 q0, d16, d6[1]
|
||||
VEXT.16 d16, d4, d5, #2
|
||||
VMLAL.S16 q0, d16, d6[2]
|
||||
VEXT.16 d16, d4, d5, #3
|
||||
VMLAL.S16 q0, d16, d6[3]
|
||||
; Process 2 samples here if we have > 2 left (still reading one extra y value).
|
||||
xcorr_kernel_neon_process2
|
||||
ADDS r12, r12, #2
|
||||
BLE xcorr_kernel_neon_process1
|
||||
; Load x[0...1]
|
||||
VLD2.16 {d6[],d7[]}, [r4]!
|
||||
; Use VAND since it's a data processing instruction again.
|
||||
VAND d4, d5, d5
|
||||
SUB r12, r12, #2
|
||||
; Load y[4...5]
|
||||
VLD1.32 {d5[]}, [r5]!
|
||||
VMLAL.S16 q0, d4, d6
|
||||
VEXT.16 d16, d4, d5, #1
|
||||
; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
|
||||
; instead of VEXT, since it's a data-processing instruction.
|
||||
VSRI.64 d5, d4, #32
|
||||
VMLAL.S16 q0, d16, d7
|
||||
; Process 1 sample using the extra y value we loaded above.
|
||||
xcorr_kernel_neon_process1
|
||||
; Load next *x
|
||||
VLD1.16 {d6[]}, [r4]!
|
||||
ADDS r12, r12, #1
|
||||
; y[0...3] are left in d5 from prior iteration(s) (if any)
|
||||
VMLAL.S16 q0, d5, d6
|
||||
MOVLE pc, lr
|
||||
; Now process 1 last sample, not reading ahead.
|
||||
; Load last *y
|
||||
VLD1.16 {d4[]}, [r5]!
|
||||
VSRI.64 d4, d5, #16
|
||||
; Load last *x
|
||||
VLD1.16 {d6[]}, [r4]!
|
||||
VMLAL.S16 q0, d4, d6
|
||||
MOV pc, lr
|
||||
ENDP
|
||||
|
||||
; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
|
||||
; opus_val32 *xcorr, int len, int max_pitch, int arch)
|
||||
celt_pitch_xcorr_neon PROC
|
||||
; input:
|
||||
; r0 = opus_val16 *_x
|
||||
; r1 = opus_val16 *_y
|
||||
; r2 = opus_val32 *xcorr
|
||||
; r3 = int len
|
||||
; output:
|
||||
; r0 = int maxcorr
|
||||
; internal usage:
|
||||
; r4 = opus_val16 *x (for xcorr_kernel_neon())
|
||||
; r5 = opus_val16 *y (for xcorr_kernel_neon())
|
||||
; r6 = int max_pitch
|
||||
; r12 = int j
|
||||
; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
|
||||
; ignored:
|
||||
; int arch
|
||||
STMFD sp!, {r4-r6, lr}
|
||||
LDR r6, [sp, #16]
|
||||
VMOV.S32 q15, #1
|
||||
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
|
||||
SUBS r6, r6, #4
|
||||
BLT celt_pitch_xcorr_neon_process4_done
|
||||
celt_pitch_xcorr_neon_process4
|
||||
; xcorr_kernel_neon parameters:
|
||||
; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
|
||||
MOV r4, r0
|
||||
MOV r5, r1
|
||||
VEOR q0, q0, q0
|
||||
; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
|
||||
; So we don't save/restore any other registers.
|
||||
BL xcorr_kernel_neon_start
|
||||
SUBS r6, r6, #4
|
||||
VST1.32 {q0}, [r2]!
|
||||
; _y += 4
|
||||
ADD r1, r1, #8
|
||||
VMAX.S32 q15, q15, q0
|
||||
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
|
||||
BGE celt_pitch_xcorr_neon_process4
|
||||
; We have less than 4 sums left to compute.
|
||||
celt_pitch_xcorr_neon_process4_done
|
||||
ADDS r6, r6, #4
|
||||
; Reduce maxcorr to a single value
|
||||
VMAX.S32 d30, d30, d31
|
||||
VPMAX.S32 d30, d30, d30
|
||||
; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
|
||||
BLE celt_pitch_xcorr_neon_done
|
||||
; Now compute each remaining sum one at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining
|
||||
MOV r4, r0
|
||||
MOV r5, r1
|
||||
VMOV.I32 q0, #0
|
||||
SUBS r12, r3, #8
|
||||
BLT celt_pitch_xcorr_neon_process_remaining4
|
||||
; Sum terms 8 at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining_loop8
|
||||
; Load x[0...7]
|
||||
VLD1.16 {q1}, [r4]!
|
||||
; Load y[0...7]
|
||||
VLD1.16 {q2}, [r5]!
|
||||
SUBS r12, r12, #8
|
||||
VMLAL.S16 q0, d4, d2
|
||||
VMLAL.S16 q0, d5, d3
|
||||
BGE celt_pitch_xcorr_neon_process_remaining_loop8
|
||||
; Sum terms 4 at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining4
|
||||
ADDS r12, r12, #4
|
||||
BLT celt_pitch_xcorr_neon_process_remaining4_done
|
||||
; Load x[0...3]
|
||||
VLD1.16 {d2}, [r4]!
|
||||
; Load y[0...3]
|
||||
VLD1.16 {d3}, [r5]!
|
||||
SUB r12, r12, #4
|
||||
VMLAL.S16 q0, d3, d2
|
||||
celt_pitch_xcorr_neon_process_remaining4_done
|
||||
; Reduce the sum to a single value.
|
||||
VADD.S32 d0, d0, d1
|
||||
VPADDL.S32 d0, d0
|
||||
ADDS r12, r12, #4
|
||||
BLE celt_pitch_xcorr_neon_process_remaining_loop_done
|
||||
; Sum terms 1 at a time.
|
||||
celt_pitch_xcorr_neon_process_remaining_loop1
|
||||
VLD1.16 {d2[]}, [r4]!
|
||||
VLD1.16 {d3[]}, [r5]!
|
||||
SUBS r12, r12, #1
|
||||
VMLAL.S16 q0, d2, d3
|
||||
BGT celt_pitch_xcorr_neon_process_remaining_loop1
|
||||
celt_pitch_xcorr_neon_process_remaining_loop_done
|
||||
VST1.32 {d0[0]}, [r2]!
|
||||
VMAX.S32 d30, d30, d0
|
||||
SUBS r6, r6, #1
|
||||
; _y++
|
||||
ADD r1, r1, #2
|
||||
; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
|
||||
BGT celt_pitch_xcorr_neon_process_remaining
|
||||
celt_pitch_xcorr_neon_done
|
||||
VMOV.32 r0, d30[0]
|
||||
LDMFD sp!, {r4-r6, pc}
|
||||
ENDP
|
||||
|
||||
ENDIF
|
||||
|
||||
IF OPUS_ARM_MAY_HAVE_EDSP
|
||||
|
||||
; This will get used on ARMv7 devices without NEON, so it has been optimized
|
||||
; to take advantage of dual-issuing where possible.
|
||||
xcorr_kernel_edsp PROC
|
||||
xcorr_kernel_edsp_start
|
||||
; input:
|
||||
; r3 = int len
|
||||
; r4 = opus_val16 *_x (must be 32-bit aligned)
|
||||
; r5 = opus_val16 *_y (must be 32-bit aligned)
|
||||
; r6...r9 = opus_val32 sum[4]
|
||||
; output:
|
||||
; r6...r9 = opus_val32 sum[4]
|
||||
; preserved: r0-r5
|
||||
; internal usage
|
||||
; r2 = int j
|
||||
; r12,r14 = opus_val16 x[4]
|
||||
; r10,r11 = opus_val16 y[4]
|
||||
STMFD sp!, {r2,r4,r5,lr}
|
||||
LDR r10, [r5], #4 ; Load y[0...1]
|
||||
SUBS r2, r3, #4 ; j = len-4
|
||||
LDR r11, [r5], #4 ; Load y[2...3]
|
||||
BLE xcorr_kernel_edsp_process4_done
|
||||
LDR r12, [r4], #4 ; Load x[0...1]
|
||||
; Stall
|
||||
xcorr_kernel_edsp_process4
|
||||
; The multiplies must issue from pipeline 0, and can't dual-issue with each
|
||||
; other. Every other instruction here dual-issues with a multiply, and is
|
||||
; thus "free". There should be no stalls in the body of the loop.
|
||||
SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
|
||||
LDR r14, [r4], #4 ; Load x[2...3]
|
||||
SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
|
||||
SUBS r2, r2, #4 ; j-=4
|
||||
SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
|
||||
SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
|
||||
SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
|
||||
LDR r10, [r5], #4 ; Load y[4...5]
|
||||
SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
|
||||
SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
|
||||
SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
|
||||
LDRGT r12, [r4], #4 ; Load x[0...1]
|
||||
SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
|
||||
SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
|
||||
SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
|
||||
SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
|
||||
SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
|
||||
LDR r11, [r5], #4 ; Load y[6...7]
|
||||
SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
|
||||
SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
|
||||
SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
|
||||
BGT xcorr_kernel_edsp_process4
|
||||
xcorr_kernel_edsp_process4_done
|
||||
ADDS r2, r2, #4
|
||||
BLE xcorr_kernel_edsp_done
|
||||
LDRH r12, [r4], #2 ; r12 = *x++
|
||||
SUBS r2, r2, #1 ; j--
|
||||
; Stall
|
||||
SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
|
||||
LDRHGT r14, [r4], #2 ; r14 = *x++
|
||||
SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
|
||||
SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
|
||||
SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
|
||||
BLE xcorr_kernel_edsp_done
|
||||
SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
|
||||
SUBS r2, r2, #1 ; j--
|
||||
SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
|
||||
LDRH r10, [r5], #2 ; r10 = y_4 = *y++
|
||||
SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
|
||||
LDRHGT r12, [r4], #2 ; r12 = *x++
|
||||
SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
|
||||
BLE xcorr_kernel_edsp_done
|
||||
SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
|
||||
CMP r2, #1 ; j--
|
||||
SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
|
||||
LDRH r2, [r5], #2 ; r2 = y_5 = *y++
|
||||
SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
|
||||
LDRHGT r14, [r4] ; r14 = *x
|
||||
SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
|
||||
BLE xcorr_kernel_edsp_done
|
||||
SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
|
||||
LDRH r11, [r5] ; r11 = y_6 = *y
|
||||
SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
|
||||
SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
|
||||
SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
|
||||
xcorr_kernel_edsp_done
|
||||
LDMFD sp!, {r2,r4,r5,pc}
|
||||
ENDP
|
||||
|
||||
celt_pitch_xcorr_edsp PROC
|
||||
; input:
|
||||
; r0 = opus_val16 *_x (must be 32-bit aligned)
|
||||
; r1 = opus_val16 *_y (only needs to be 16-bit aligned)
|
||||
; r2 = opus_val32 *xcorr
|
||||
; r3 = int len
|
||||
; output:
|
||||
; r0 = maxcorr
|
||||
; internal usage
|
||||
; r4 = opus_val16 *x
|
||||
; r5 = opus_val16 *y
|
||||
; r6 = opus_val32 sum0
|
||||
; r7 = opus_val32 sum1
|
||||
; r8 = opus_val32 sum2
|
||||
; r9 = opus_val32 sum3
|
||||
; r1 = int max_pitch
|
||||
; r12 = int j
|
||||
; ignored:
|
||||
; int arch
|
||||
STMFD sp!, {r4-r11, lr}
|
||||
MOV r5, r1
|
||||
LDR r1, [sp, #36]
|
||||
MOV r4, r0
|
||||
TST r5, #3
|
||||
; maxcorr = 1
|
||||
MOV r0, #1
|
||||
BEQ celt_pitch_xcorr_edsp_process1u_done
|
||||
; Compute one sum at the start to make y 32-bit aligned.
|
||||
SUBS r12, r3, #4
|
||||
; r14 = sum = 0
|
||||
MOV r14, #0
|
||||
LDRH r8, [r5], #2
|
||||
BLE celt_pitch_xcorr_edsp_process1u_loop4_done
|
||||
LDR r6, [r4], #4
|
||||
MOV r8, r8, LSL #16
|
||||
celt_pitch_xcorr_edsp_process1u_loop4
|
||||
LDR r9, [r5], #4
|
||||
SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
|
||||
LDR r7, [r4], #4
|
||||
SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1)
|
||||
LDR r8, [r5], #4
|
||||
SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
|
||||
SUBS r12, r12, #4 ; j-=4
|
||||
SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3)
|
||||
LDRGT r6, [r4], #4
|
||||
BGT celt_pitch_xcorr_edsp_process1u_loop4
|
||||
MOV r8, r8, LSR #16
|
||||
celt_pitch_xcorr_edsp_process1u_loop4_done
|
||||
ADDS r12, r12, #4
|
||||
celt_pitch_xcorr_edsp_process1u_loop1
|
||||
LDRHGE r6, [r4], #2
|
||||
; Stall
|
||||
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
|
||||
SUBSGE r12, r12, #1
|
||||
LDRHGT r8, [r5], #2
|
||||
BGT celt_pitch_xcorr_edsp_process1u_loop1
|
||||
; Restore _x
|
||||
SUB r4, r4, r3, LSL #1
|
||||
; Restore and advance _y
|
||||
SUB r5, r5, r3, LSL #1
|
||||
; maxcorr = max(maxcorr, sum)
|
||||
CMP r0, r14
|
||||
ADD r5, r5, #2
|
||||
MOVLT r0, r14
|
||||
SUBS r1, r1, #1
|
||||
; xcorr[i] = sum
|
||||
STR r14, [r2], #4
|
||||
BLE celt_pitch_xcorr_edsp_done
|
||||
celt_pitch_xcorr_edsp_process1u_done
|
||||
; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
|
||||
SUBS r1, r1, #4
|
||||
BLT celt_pitch_xcorr_edsp_process2
|
||||
celt_pitch_xcorr_edsp_process4
|
||||
; xcorr_kernel_edsp parameters:
|
||||
; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
|
||||
MOV r6, #0
|
||||
MOV r7, #0
|
||||
MOV r8, #0
|
||||
MOV r9, #0
|
||||
BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
|
||||
; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
|
||||
CMP r0, r6
|
||||
; _y+=4
|
||||
ADD r5, r5, #8
|
||||
MOVLT r0, r6
|
||||
CMP r0, r7
|
||||
MOVLT r0, r7
|
||||
CMP r0, r8
|
||||
MOVLT r0, r8
|
||||
CMP r0, r9
|
||||
MOVLT r0, r9
|
||||
STMIA r2!, {r6-r9}
|
||||
SUBS r1, r1, #4
|
||||
BGE celt_pitch_xcorr_edsp_process4
|
||||
celt_pitch_xcorr_edsp_process2
|
||||
ADDS r1, r1, #2
|
||||
BLT celt_pitch_xcorr_edsp_process1a
|
||||
SUBS r12, r3, #4
|
||||
; {r10, r11} = {sum0, sum1} = {0, 0}
|
||||
MOV r10, #0
|
||||
MOV r11, #0
|
||||
LDR r8, [r5], #4
|
||||
BLE celt_pitch_xcorr_edsp_process2_loop_done
|
||||
LDR r6, [r4], #4
|
||||
LDR r9, [r5], #4
|
||||
celt_pitch_xcorr_edsp_process2_loop4
|
||||
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
|
||||
LDR r7, [r4], #4
|
||||
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
|
||||
SUBS r12, r12, #4 ; j-=4
|
||||
SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
|
||||
LDR r8, [r5], #4
|
||||
SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
|
||||
LDRGT r6, [r4], #4
|
||||
SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2)
|
||||
SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3)
|
||||
SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3)
|
||||
LDRGT r9, [r5], #4
|
||||
SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4)
|
||||
BGT celt_pitch_xcorr_edsp_process2_loop4
|
||||
celt_pitch_xcorr_edsp_process2_loop_done
|
||||
ADDS r12, r12, #2
|
||||
BLE celt_pitch_xcorr_edsp_process2_1
|
||||
LDR r6, [r4], #4
|
||||
; Stall
|
||||
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
|
||||
LDR r9, [r5], #4
|
||||
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
|
||||
SUB r12, r12, #2
|
||||
SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
|
||||
MOV r8, r9
|
||||
SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
|
||||
celt_pitch_xcorr_edsp_process2_1
|
||||
LDRH r6, [r4], #2
|
||||
ADDS r12, r12, #1
|
||||
; Stall
|
||||
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
|
||||
LDRHGT r7, [r4], #2
|
||||
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
|
||||
BLE celt_pitch_xcorr_edsp_process2_done
|
||||
LDRH r9, [r5], #2
|
||||
SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1)
|
||||
SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2)
|
||||
celt_pitch_xcorr_edsp_process2_done
|
||||
; Restore _x
|
||||
SUB r4, r4, r3, LSL #1
|
||||
; Restore and advance _y
|
||||
SUB r5, r5, r3, LSL #1
|
||||
; maxcorr = max(maxcorr, sum0)
|
||||
CMP r0, r10
|
||||
ADD r5, r5, #2
|
||||
MOVLT r0, r10
|
||||
SUB r1, r1, #2
|
||||
; maxcorr = max(maxcorr, sum1)
|
||||
CMP r0, r11
|
||||
; xcorr[i] = sum
|
||||
STR r10, [r2], #4
|
||||
MOVLT r0, r11
|
||||
STR r11, [r2], #4
|
||||
celt_pitch_xcorr_edsp_process1a
|
||||
ADDS r1, r1, #1
|
||||
BLT celt_pitch_xcorr_edsp_done
|
||||
SUBS r12, r3, #4
|
||||
; r14 = sum = 0
|
||||
MOV r14, #0
|
||||
BLT celt_pitch_xcorr_edsp_process1a_loop_done
|
||||
LDR r6, [r4], #4
|
||||
LDR r8, [r5], #4
|
||||
LDR r7, [r4], #4
|
||||
LDR r9, [r5], #4
|
||||
celt_pitch_xcorr_edsp_process1a_loop4
|
||||
SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
|
||||
SUBS r12, r12, #4 ; j-=4
|
||||
SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
|
||||
LDRGE r6, [r4], #4
|
||||
SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
|
||||
LDRGE r8, [r5], #4
|
||||
SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
|
||||
LDRGE r7, [r4], #4
|
||||
LDRGE r9, [r5], #4
|
||||
BGE celt_pitch_xcorr_edsp_process1a_loop4
|
||||
celt_pitch_xcorr_edsp_process1a_loop_done
|
||||
ADDS r12, r12, #2
|
||||
LDRGE r6, [r4], #4
|
||||
LDRGE r8, [r5], #4
|
||||
; Stall
|
||||
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
|
||||
SUBGE r12, r12, #2
|
||||
SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
|
||||
ADDS r12, r12, #1
|
||||
LDRHGE r6, [r4], #2
|
||||
LDRHGE r8, [r5], #2
|
||||
; Stall
|
||||
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
|
||||
; maxcorr = max(maxcorr, sum)
|
||||
CMP r0, r14
|
||||
; xcorr[i] = sum
|
||||
STR r14, [r2], #4
|
||||
MOVLT r0, r14
|
||||
celt_pitch_xcorr_edsp_done
|
||||
LDMFD sp!, {r4-r11, pc}
|
||||
ENDP
|
||||
|
||||
ENDIF
|
||||
|
||||
END
|
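The EDSP kernels above lean on the ARMv5E halfword multiply-accumulates: each 32-bit LDR pulls in two 16-bit samples, and the B/T suffixes of SMLA pick the bottom or top halfword of each source register. A small stand-alone C model of that mapping (all names here are illustrative, not from the sources):

#include <stdint.h>

/* MAC16_16 as written in the comments above: acc + a*b, kept in 32 bits. */
static int32_t mac16_16(int32_t acc, int16_t a, int16_t b)
{
    return acc + (int32_t)a * b;
}

/* Halfword selectors, assuming the little-endian packing the kernels rely on:
 * the sample loaded first ends up in the bottom halfword of the register. */
static int16_t bottom16(uint32_t w) { return (int16_t)(w & 0xffffu); }  /* ...B */
static int16_t top16(uint32_t w)    { return (int16_t)(w >> 16); }      /* ...T */

/* A line such as "SMLABT r7, r12, r10, r7" then behaves like
 *     r7 = mac16_16(r7, bottom16(r12), top16(r10));
 * which matches its comment sum[1] = MAC16_16(sum[1], x_0, y_1) when r12
 * holds {x_1, x_0} and r10 holds {y_1, y_0}. */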
71
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/fft_arm.h
vendored
Normal file
|
@@ -0,0 +1,71 @@
|
|||
/* Copyright (c) 2015 Xiph.Org Foundation
|
||||
Written by Viswanath Puttagunta */
|
||||
/**
|
||||
@file fft_arm.h
|
||||
@brief ARM Neon Intrinsic optimizations for fft using NE10 library
|
||||
*/
|
||||
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#if !defined(FFT_ARM_H)
|
||||
#define FFT_ARM_H
|
||||
|
||||
#include "kiss_fft.h"
|
||||
|
||||
#if defined(HAVE_ARM_NE10)
|
||||
|
||||
int opus_fft_alloc_arm_neon(kiss_fft_state *st);
|
||||
void opus_fft_free_arm_neon(kiss_fft_state *st);
|
||||
|
||||
void opus_fft_neon(const kiss_fft_state *st,
|
||||
const kiss_fft_cpx *fin,
|
||||
kiss_fft_cpx *fout);
|
||||
|
||||
void opus_ifft_neon(const kiss_fft_state *st,
|
||||
const kiss_fft_cpx *fin,
|
||||
kiss_fft_cpx *fout);
|
||||
|
||||
#if !defined(OPUS_HAVE_RTCD)
|
||||
#define OVERRIDE_OPUS_FFT (1)
|
||||
|
||||
#define opus_fft_alloc_arch(_st, arch) \
|
||||
((void)(arch), opus_fft_alloc_arm_neon(_st))
|
||||
|
||||
#define opus_fft_free_arch(_st, arch) \
|
||||
((void)(arch), opus_fft_free_arm_neon(_st))
|
||||
|
||||
#define opus_fft(_st, _fin, _fout, arch) \
|
||||
((void)(arch), opus_fft_neon(_st, _fin, _fout))
|
||||
|
||||
#define opus_ifft(_st, _fin, _fout, arch) \
|
||||
((void)(arch), opus_ifft_neon(_st, _fin, _fout))
|
||||
|
||||
#endif /* OPUS_HAVE_RTCD */
|
||||
|
||||
#endif /* HAVE_ARM_NE10 */
|
||||
|
||||
#endif
|
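The OVERRIDE_OPUS_FFT block above replaces run-time CPU dispatch with compile-time binding: the (void)(arch) comma expression evaluates and discards the arch argument, and the call goes straight to the NE10-backed routine. A minimal stand-alone model of that pattern, using stub names rather than the real Opus types:

#include <stdio.h>

typedef struct { int nfft; } fft_state;        /* stand-in for kiss_fft_state */
typedef struct { float r, i; } cpx;            /* stand-in for kiss_fft_cpx   */

static void fft_neon(const fft_state *st, const cpx *fin, cpx *fout)
{
    (void)st; (void)fin; (void)fout;
    puts("NEON-backed FFT bound at compile time");
}

/* Same shape as the opus_fft(...) override above. */
#define my_fft(st, fin, fout, arch) ((void)(arch), fft_neon(st, fin, fout))

int main(void)
{
    fft_state st = { 480 };
    cpx in[4] = { { 0.0f, 0.0f } }, out[4];
    my_fft(&st, in, out, /*arch=*/3);          /* arch is evaluated and ignored */
    (void)out;
    return 0;
}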
35
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/fixed_arm64.h
vendored
Normal file
|
@@ -0,0 +1,35 @@
|
|||
/* Copyright (C) 2015 Vidyo */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef FIXED_ARM64_H
|
||||
#define FIXED_ARM64_H
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#undef SIG2WORD16
|
||||
#define SIG2WORD16(x) (vqmovns_s32(PSHR32((x), SIG_SHIFT)))
|
||||
|
||||
#endif
|
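A portable model of the SIG2WORD16 mapping above (illustrative only): PSHR32 is Opus's shift-right-with-rounding, and vqmovns_s32 narrows the 32-bit result to 16 bits with saturation. The SIG_SHIFT value of 12 is assumed here; it matches the ASR #12 / +2048 form of the ARMv6 version further down in this commit.

#include <stdint.h>

#define SIG_SHIFT 12   /* assumed; consistent with the ssat/ASR #12 variant below */

static int16_t sig2word16_ref(int32_t x)
{
    int32_t v = (x + (1 << (SIG_SHIFT - 1))) >> SIG_SHIFT;  /* PSHR32: shift with rounding */
    if (v >  32767) v =  32767;    /* saturate, as vqmovns_s32 does in one step */
    if (v < -32768) v = -32768;
    return (int16_t)v;
}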
80
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/fixed_armv4.h
vendored
Normal file
|
@@ -0,0 +1,80 @@
|
|||
/* Copyright (C) 2013 Xiph.Org Foundation and contributors */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef FIXED_ARMv4_H
|
||||
#define FIXED_ARMv4_H
|
||||
|
||||
/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
|
||||
#undef MULT16_32_Q16
|
||||
static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
|
||||
{
|
||||
unsigned rd_lo;
|
||||
int rd_hi;
|
||||
__asm__(
|
||||
"#MULT16_32_Q16\n\t"
|
||||
"smull %0, %1, %2, %3\n\t"
|
||||
: "=&r"(rd_lo), "=&r"(rd_hi)
|
||||
: "%r"(b),"r"(SHL32(a,16))
|
||||
);
|
||||
return rd_hi;
|
||||
}
|
||||
#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv4(a, b))
|
||||
|
||||
|
||||
/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits */
|
||||
#undef MULT16_32_Q15
|
||||
static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
|
||||
{
|
||||
unsigned rd_lo;
|
||||
int rd_hi;
|
||||
__asm__(
|
||||
"#MULT16_32_Q15\n\t"
|
||||
"smull %0, %1, %2, %3\n\t"
|
||||
: "=&r"(rd_lo), "=&r"(rd_hi)
|
||||
: "%r"(b), "r"(SHL32(a,16))
|
||||
);
|
||||
/*We intentionally don't OR in the high bit of rd_lo for speed.*/
|
||||
return SHL32(rd_hi,1);
|
||||
}
|
||||
#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
|
||||
|
||||
|
||||
/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
|
||||
b must fit in 31 bits.
|
||||
Result fits in 32 bits. */
|
||||
#undef MAC16_32_Q15
|
||||
#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
|
||||
|
||||
/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
|
||||
Result fits in 32 bits. */
|
||||
#undef MAC16_32_Q16
|
||||
#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))
|
||||
|
||||
/** 32x32 multiplication, followed by a 31-bit shift right. Result fits in 32 bits */
|
||||
#undef MULT32_32_Q31
|
||||
#define MULT32_32_Q31(a,b) (opus_val32)((((opus_int64)(a)) * ((opus_int64)(b)))>>31)
|
||||
|
||||
#endif
|
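For orientation, the portable equivalents of the two SMULL-based helpers above (a sketch using the same type names, which are assumed typedefs here): a Q16 or Q15 product is the 48-bit 16x32 result shifted right by 16 or 15. Pre-shifting a left by 16 makes the wanted bits land in the high register of SMULL's 64-bit result, so the high word is MULT16_32_Q16 directly, and doubling it gives MULT16_32_Q15 minus the one low bit the comment says is intentionally dropped.

#include <stdint.h>

typedef int16_t opus_val16;   /* assumed */
typedef int32_t opus_val32;   /* assumed */

static opus_val32 mult16_32_q16_ref(opus_val16 a, opus_val32 b)
{
    return (opus_val32)(((int64_t)a * b) >> 16);
}

static opus_val32 mult16_32_q15_ref(opus_val16 a, opus_val32 b)
{
    return (opus_val32)(((int64_t)a * b) >> 15);
}

/* The SMULL route above is equivalent to:
 *   hi  = (opus_val32)(((int64_t)(a << 16) * b) >> 32);   -- mult16_32_q16_ref
 *   q15 = hi << 1;                                        -- Q15 result, low bit dropped
 */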
151
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/fixed_armv5e.h
vendored
Normal file
|
@@ -0,0 +1,151 @@
|
|||
/* Copyright (C) 2007-2009 Xiph.Org Foundation
|
||||
Copyright (C) 2003-2008 Jean-Marc Valin
|
||||
Copyright (C) 2007-2008 CSIRO
|
||||
Copyright (C) 2013 Parrot */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef FIXED_ARMv5E_H
|
||||
#define FIXED_ARMv5E_H
|
||||
|
||||
#include "fixed_armv4.h"
|
||||
|
||||
/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
|
||||
#undef MULT16_32_Q16
|
||||
static OPUS_INLINE opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
|
||||
{
|
||||
int res;
|
||||
__asm__(
|
||||
"#MULT16_32_Q16\n\t"
|
||||
"smulwb %0, %1, %2\n\t"
|
||||
: "=r"(res)
|
||||
: "r"(b),"r"(a)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv5e(a, b))
|
||||
|
||||
|
||||
/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits */
|
||||
#undef MULT16_32_Q15
|
||||
static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
|
||||
{
|
||||
int res;
|
||||
__asm__(
|
||||
"#MULT16_32_Q15\n\t"
|
||||
"smulwb %0, %1, %2\n\t"
|
||||
: "=r"(res)
|
||||
: "r"(b), "r"(a)
|
||||
);
|
||||
return SHL32(res,1);
|
||||
}
|
||||
#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
|
||||
|
||||
|
||||
/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
|
||||
b must fit in 31 bits.
|
||||
Result fits in 32 bits. */
|
||||
#undef MAC16_32_Q15
|
||||
static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
|
||||
opus_val32 b)
|
||||
{
|
||||
int res;
|
||||
__asm__(
|
||||
"#MAC16_32_Q15\n\t"
|
||||
"smlawb %0, %1, %2, %3;\n"
|
||||
: "=r"(res)
|
||||
: "r"(SHL32(b,1)), "r"(a), "r"(c)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
|
||||
|
||||
/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
|
||||
Result fits in 32 bits. */
|
||||
#undef MAC16_32_Q16
|
||||
static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a,
|
||||
opus_val32 b)
|
||||
{
|
||||
int res;
|
||||
__asm__(
|
||||
"#MAC16_32_Q16\n\t"
|
||||
"smlawb %0, %1, %2, %3;\n"
|
||||
: "=r"(res)
|
||||
: "r"(b), "r"(a), "r"(c)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b))
|
||||
|
||||
/** 16x16 multiply-add where the result fits in 32 bits */
|
||||
#undef MAC16_16
|
||||
static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
|
||||
opus_val16 b)
|
||||
{
|
||||
int res;
|
||||
__asm__(
|
||||
"#MAC16_16\n\t"
|
||||
"smlabb %0, %1, %2, %3;\n"
|
||||
: "=r"(res)
|
||||
: "r"(a), "r"(b), "r"(c)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
#define MAC16_16(c, a, b) (MAC16_16_armv5e(c, a, b))
|
||||
|
||||
/** 16x16 multiplication where the result fits in 32 bits */
|
||||
#undef MULT16_16
|
||||
static OPUS_INLINE opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
|
||||
{
|
||||
int res;
|
||||
__asm__(
|
||||
"#MULT16_16\n\t"
|
||||
"smulbb %0, %1, %2;\n"
|
||||
: "=r"(res)
|
||||
: "r"(a), "r"(b)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
#define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
|
||||
|
||||
#ifdef OPUS_ARM_INLINE_MEDIA
|
||||
|
||||
#undef SIG2WORD16
|
||||
static OPUS_INLINE opus_val16 SIG2WORD16_armv6(opus_val32 x)
|
||||
{
|
||||
celt_sig res;
|
||||
__asm__(
|
||||
"#SIG2WORD16\n\t"
|
||||
"ssat %0, #16, %1, ASR #12\n\t"
|
||||
: "=r"(res)
|
||||
: "r"(x+2048)
|
||||
);
|
||||
return EXTRACT16(res);
|
||||
}
|
||||
#define SIG2WORD16(x) (SIG2WORD16_armv6(x))
|
||||
|
||||
#endif /* OPUS_ARM_INLINE_MEDIA */
|
||||
|
||||
#endif
|
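The least obvious piece above is MAC16_32_Q15_armv5e: SMLAWB only provides a 16-bit shift (it adds (Rn * a) >> 16 to the accumulator), so the code hands it SHL32(b,1). Since (2*b*a) >> 16 equals (a*b) >> 15, that recovers the 15-bit shift, and it is exactly why b must fit in 31 bits. A hedged C model of the trick:

#include <stdint.h>

typedef int16_t opus_val16;   /* assumed */
typedef int32_t opus_val32;   /* assumed */

/* What SMLAWB itself offers: accumulate a 16x32 product shifted right by 16. */
static opus_val32 smlawb_model(opus_val32 c, opus_val32 rn, opus_val16 rm)
{
    return c + (opus_val32)(((int64_t)rn * rm) >> 16);
}

/* MAC16_32_Q15 built the same way as the inline asm above: feed b<<1 so the
 * hardware's 16-bit shift acts as the required 15-bit shift. b must fit in
 * 31 bits so that b<<1 does not overflow. */
static opus_val32 mac16_32_q15_ref(opus_val32 c, opus_val16 a, opus_val32 b)
{
    return smlawb_model(c, b << 1, a);
}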
121
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/kiss_fft_armv4.h
vendored
Normal file
|
@@ -0,0 +1,121 @@
|
|||
/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.*/
|
||||
|
||||
#ifndef KISS_FFT_ARMv4_H
|
||||
#define KISS_FFT_ARMv4_H
|
||||
|
||||
#if !defined(KISS_FFT_GUTS_H)
|
||||
#error "This file should only be included from _kiss_fft_guts.h"
|
||||
#endif
|
||||
|
||||
#ifdef FIXED_POINT
|
||||
|
||||
#undef C_MUL
|
||||
#define C_MUL(m,a,b) \
|
||||
do{ \
|
||||
int br__; \
|
||||
int bi__; \
|
||||
int tt__; \
|
||||
__asm__ __volatile__( \
|
||||
"#C_MUL\n\t" \
|
||||
"ldrsh %[br], [%[bp], #0]\n\t" \
|
||||
"ldm %[ap], {r0,r1}\n\t" \
|
||||
"ldrsh %[bi], [%[bp], #2]\n\t" \
|
||||
"smull %[tt], %[mi], r1, %[br]\n\t" \
|
||||
"smlal %[tt], %[mi], r0, %[bi]\n\t" \
|
||||
"rsb %[bi], %[bi], #0\n\t" \
|
||||
"smull %[br], %[mr], r0, %[br]\n\t" \
|
||||
"mov %[tt], %[tt], lsr #15\n\t" \
|
||||
"smlal %[br], %[mr], r1, %[bi]\n\t" \
|
||||
"orr %[mi], %[tt], %[mi], lsl #17\n\t" \
|
||||
"mov %[br], %[br], lsr #15\n\t" \
|
||||
"orr %[mr], %[br], %[mr], lsl #17\n\t" \
|
||||
: [mr]"=r"((m).r), [mi]"=r"((m).i), \
|
||||
[br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
|
||||
: [ap]"r"(&(a)), [bp]"r"(&(b)) \
|
||||
: "r0", "r1" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
#undef C_MUL4
|
||||
#define C_MUL4(m,a,b) \
|
||||
do{ \
|
||||
int br__; \
|
||||
int bi__; \
|
||||
int tt__; \
|
||||
__asm__ __volatile__( \
|
||||
"#C_MUL4\n\t" \
|
||||
"ldrsh %[br], [%[bp], #0]\n\t" \
|
||||
"ldm %[ap], {r0,r1}\n\t" \
|
||||
"ldrsh %[bi], [%[bp], #2]\n\t" \
|
||||
"smull %[tt], %[mi], r1, %[br]\n\t" \
|
||||
"smlal %[tt], %[mi], r0, %[bi]\n\t" \
|
||||
"rsb %[bi], %[bi], #0\n\t" \
|
||||
"smull %[br], %[mr], r0, %[br]\n\t" \
|
||||
"mov %[tt], %[tt], lsr #17\n\t" \
|
||||
"smlal %[br], %[mr], r1, %[bi]\n\t" \
|
||||
"orr %[mi], %[tt], %[mi], lsl #15\n\t" \
|
||||
"mov %[br], %[br], lsr #17\n\t" \
|
||||
"orr %[mr], %[br], %[mr], lsl #15\n\t" \
|
||||
: [mr]"=r"((m).r), [mi]"=r"((m).i), \
|
||||
[br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
|
||||
: [ap]"r"(&(a)), [bp]"r"(&(b)) \
|
||||
: "r0", "r1" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
#undef C_MULC
|
||||
#define C_MULC(m,a,b) \
|
||||
do{ \
|
||||
int br__; \
|
||||
int bi__; \
|
||||
int tt__; \
|
||||
__asm__ __volatile__( \
|
||||
"#C_MULC\n\t" \
|
||||
"ldrsh %[br], [%[bp], #0]\n\t" \
|
||||
"ldm %[ap], {r0,r1}\n\t" \
|
||||
"ldrsh %[bi], [%[bp], #2]\n\t" \
|
||||
"smull %[tt], %[mr], r0, %[br]\n\t" \
|
||||
"smlal %[tt], %[mr], r1, %[bi]\n\t" \
|
||||
"rsb %[bi], %[bi], #0\n\t" \
|
||||
"smull %[br], %[mi], r1, %[br]\n\t" \
|
||||
"mov %[tt], %[tt], lsr #15\n\t" \
|
||||
"smlal %[br], %[mi], r0, %[bi]\n\t" \
|
||||
"orr %[mr], %[tt], %[mr], lsl #17\n\t" \
|
||||
"mov %[br], %[br], lsr #15\n\t" \
|
||||
"orr %[mi], %[br], %[mi], lsl #17\n\t" \
|
||||
: [mr]"=r"((m).r), [mi]"=r"((m).i), \
|
||||
[br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
|
||||
: [ap]"r"(&(a)), [bp]"r"(&(b)) \
|
||||
: "r0", "r1" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
#endif /* FIXED_POINT */
|
||||
|
||||
#endif /* KISS_FFT_ARMv4_H */
|
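What the C_MUL family above computes, modeled in plain C (a sketch; the struct names are stand-ins for the 32-bit kiss_fft_cpx and the 16-bit twiddle type): a fixed-point complex multiply with Q15 twiddles, accumulated in 64 bits and shifted right by 15. C_MUL4 is the same with a 17-bit shift, and C_MULC multiplies by the conjugate of b.

#include <stdint.h>

typedef struct { int32_t r, i; } cpx32;   /* stand-in for kiss_fft_cpx     */
typedef struct { int16_t r, i; } cpx16;   /* stand-in for the twiddle type */

/* C_MUL: m = a * b, with b in Q15, each part shifted right by 15. */
static cpx32 c_mul_ref(cpx32 a, cpx16 b)
{
    cpx32 m;
    m.r = (int32_t)(((int64_t)a.r * b.r - (int64_t)a.i * b.i) >> 15);
    m.i = (int32_t)(((int64_t)a.r * b.i + (int64_t)a.i * b.r) >> 15);
    return m;
}

/* C_MULC: multiply by the conjugate of b (same shift). */
static cpx32 c_mulc_ref(cpx32 a, cpx16 b)
{
    cpx32 m;
    m.r = (int32_t)(((int64_t)a.r * b.r + (int64_t)a.i * b.i) >> 15);
    m.i = (int32_t)(((int64_t)a.i * b.r - (int64_t)a.r * b.i) >> 15);
    return m;
}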
118
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/kiss_fft_armv5e.h
vendored
Normal file
|
@ -0,0 +1,118 @@
|
|||
/*Copyright (c) 2013, Xiph.Org Foundation and contributors.

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.*/

#ifndef KISS_FFT_ARMv5E_H
#define KISS_FFT_ARMv5E_H

#if !defined(KISS_FFT_GUTS_H)
#error "This file should only be included from _kiss_fft_guts.h"
#endif

#ifdef FIXED_POINT

#if defined(__thumb__)||defined(__thumb2__)
#define LDRD_CONS "Q"
#else
#define LDRD_CONS "Uq"
#endif

#undef C_MUL
#define C_MUL(m,a,b) \
    do{ \
        int mr1__; \
        int mr2__; \
        int mi__; \
        long long aval__; \
        int bval__; \
        __asm__( \
            "#C_MUL\n\t" \
            "ldrd %[aval], %H[aval], %[ap]\n\t" \
            "ldr %[bval], %[bp]\n\t" \
            "smulwb %[mi], %H[aval], %[bval]\n\t" \
            "smulwb %[mr1], %[aval], %[bval]\n\t" \
            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
            : [ap]LDRD_CONS(a), [bp]"m"(b) \
        ); \
        (m).r = SHL32(SUB32(mr1__, mr2__), 1); \
        (m).i = SHL32(mi__, 1); \
    } \
    while(0)

#undef C_MUL4
#define C_MUL4(m,a,b) \
    do{ \
        int mr1__; \
        int mr2__; \
        int mi__; \
        long long aval__; \
        int bval__; \
        __asm__( \
            "#C_MUL4\n\t" \
            "ldrd %[aval], %H[aval], %[ap]\n\t" \
            "ldr %[bval], %[bp]\n\t" \
            "smulwb %[mi], %H[aval], %[bval]\n\t" \
            "smulwb %[mr1], %[aval], %[bval]\n\t" \
            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
            : [ap]LDRD_CONS(a), [bp]"m"(b) \
        ); \
        (m).r = SHR32(SUB32(mr1__, mr2__), 1); \
        (m).i = SHR32(mi__, 1); \
    } \
    while(0)

#undef C_MULC
#define C_MULC(m,a,b) \
    do{ \
        int mr__; \
        int mi1__; \
        int mi2__; \
        long long aval__; \
        int bval__; \
        __asm__( \
            "#C_MULC\n\t" \
            "ldrd %[aval], %H[aval], %[ap]\n\t" \
            "ldr %[bval], %[bp]\n\t" \
            "smulwb %[mr], %[aval], %[bval]\n\t" \
            "smulwb %[mi1], %H[aval], %[bval]\n\t" \
            "smulwt %[mi2], %[aval], %[bval]\n\t" \
            "smlawt %[mr], %H[aval], %[bval], %[mr]\n\t" \
            : [mr]"=r"(mr__), [mi1]"=r"(mi1__), [mi2]"=r"(mi2__), \
              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
            : [ap]LDRD_CONS(a), [bp]"m"(b) \
        ); \
        (m).r = SHL32(mr__, 1); \
        (m).i = SHL32(SUB32(mi1__, mi2__), 1); \
    } \
    while(0)

#endif /* FIXED_POINT */

#endif /* KISS_FFT_ARMv5E_H */
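The ARMv5E variant above reaches the same results through the DSP halfword multiplies: ldrd pulls a.r/a.i into a 64-bit register pair, ldr packs the two 16-bit twiddle halves into one word (little-endian layout assumed, b.r low, b.i high), and smulwb/smulwt form a (32x16)>>16 product from the bottom or top half. The trailing SHL32(..., 1) turns that >>16 into the >>15 scaling of C_MUL, and SHR32(..., 1) in C_MUL4 gives >>17 overall. A small plain-C sketch of the smulwb/smulwt semantics assumed here:

/* Sketch only: C model of the smulwb/smulwt products used above.
 * b packs two 16-bit twiddle halves: b.r in bits 0-15, b.i in bits 16-31. */
static int smulwb_ref(int a, int b)   /* (a * low 16 bits of b) >> 16 */
{
    return (int)(((long long)a * (short)(b & 0xFFFF)) >> 16);
}

static int smulwt_ref(int a, int b)   /* (a * high 16 bits of b) >> 16 */
{
    return (int)(((long long)a * (short)(b >> 16)) >> 16);
}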
59
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/mdct_arm.h
vendored
Normal file
@ -0,0 +1,59 @@
/* Copyright (c) 2015 Xiph.Org Foundation
   Written by Viswanath Puttagunta */
/**
   @file mdct_arm.h
   @brief ARM Neon Intrinsic optimizations for mdct using NE10 library
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#if !defined(MDCT_ARM_H)
#define MDCT_ARM_H

#include "mdct.h"

#if defined(HAVE_ARM_NE10)
/** Compute a forward MDCT and scale by 4/N, trashes the input array */
void clt_mdct_forward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
                           kiss_fft_scalar * OPUS_RESTRICT out,
                           const opus_val16 *window, int overlap,
                           int shift, int stride, int arch);

void clt_mdct_backward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
                            kiss_fft_scalar * OPUS_RESTRICT out,
                            const opus_val16 *window, int overlap,
                            int shift, int stride, int arch);

#if !defined(OPUS_HAVE_RTCD)
#define OVERRIDE_OPUS_MDCT (1)
#define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
   clt_mdct_forward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
#define clt_mdct_backward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
   clt_mdct_backward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
#endif /* OPUS_HAVE_RTCD */
#endif /* HAVE_ARM_NE10 */

#endif
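When the NE10-backed MDCT is compiled in and run-time CPU detection (OPUS_HAVE_RTCD) is off, the override macros above simply rename clt_mdct_forward/backward to the NEON entry points, so call sites need no changes. A self-contained sketch of that compile-time override pattern, using hypothetical stand-in names rather than the opus ones:

/* Sketch only: compile-time override pattern; every name here is hypothetical. */
#include <stdio.h>

static void my_mdct_forward_c(const float *in, float *out)    { (void)in; (void)out; puts("generic C path"); }
static void my_mdct_forward_neon(const float *in, float *out) { (void)in; (void)out; puts("NE10/NEON path"); }

#if defined(HAVE_MY_NE10) && !defined(MY_HAVE_RTCD)
# define my_mdct_forward(in, out) my_mdct_forward_neon(in, out)
#else
# define my_mdct_forward(in, out) my_mdct_forward_c(in, out)
#endif

int main(void)
{
    float in[4] = {0}, out[4];
    my_mdct_forward(in, out);   /* resolved at compile time, no function pointer */
    return 0;
}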
160
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/pitch_arm.h
vendored
Normal file
@ -0,0 +1,160 @@
/* Copyright (c) 2010 Xiph.Org Foundation
 * Copyright (c) 2013 Parrot */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#if !defined(PITCH_ARM_H)
# define PITCH_ARM_H

# include "armcpu.h"

# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
        const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);

# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
# define OVERRIDE_CELT_INNER_PROD (1)
# define OVERRIDE_DUAL_INNER_PROD (1)
# define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
# endif
# endif

# if !defined(OVERRIDE_CELT_INNER_PROD)
# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
# define OVERRIDE_CELT_INNER_PROD (1)
# define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_CELT_INNER_PROD (1)
# define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
# endif
# endif

# if !defined(OVERRIDE_DUAL_INNER_PROD)
# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
        const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
# define OVERRIDE_DUAL_INNER_PROD (1)
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_DUAL_INNER_PROD (1)
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
# endif
# endif

# if defined(FIXED_POINT)

# if defined(OPUS_ARM_MAY_HAVE_NEON)
opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
        opus_val32 *xcorr, int len, int max_pitch, int arch);
# endif

# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
# define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
# endif

# if defined(OPUS_ARM_MAY_HAVE_EDSP)
opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
        opus_val32 *xcorr, int len, int max_pitch, int arch);
# endif

# if defined(OPUS_HAVE_RTCD) && \
    ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
     (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
     (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
extern opus_val32
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
        const opus_val16 *, opus_val32 *, int, int, int);
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
    ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
        xcorr, len, max_pitch, arch))

# elif defined(OPUS_ARM_PRESUME_EDSP) || \
    defined(OPUS_ARM_PRESUME_MEDIA) || \
    defined(OPUS_ARM_PRESUME_NEON)
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))

# endif

# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void xcorr_kernel_neon_fixed(
        const opus_val16 *x,
        const opus_val16 *y,
        opus_val32 sum[4],
        int len);
# endif

# if defined(OPUS_HAVE_RTCD) && \
    (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))

extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
        const opus_val16 *x,
        const opus_val16 *y,
        opus_val32 sum[4],
        int len);

# define OVERRIDE_XCORR_KERNEL (1)
# define xcorr_kernel(x, y, sum, len, arch) \
    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))

# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_XCORR_KERNEL (1)
# define xcorr_kernel(x, y, sum, len, arch) \
    ((void)arch, xcorr_kernel_neon_fixed(x, y, sum, len))

# endif

#else /* Start !FIXED_POINT */
/* Float case */
#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
        opus_val32 *xcorr, int len, int max_pitch, int arch);
#endif

# if defined(OPUS_HAVE_RTCD) && \
    (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
        const opus_val16 *, opus_val32 *, int, int, int);

# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
    ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
        xcorr, len, max_pitch, arch))

# elif defined(OPUS_ARM_PRESUME_NEON_INTR)

# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr celt_pitch_xcorr_float_neon

# endif

#endif /* end !FIXED_POINT */

#endif
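With OPUS_HAVE_RTCD defined, the macros above route each call through an arch-indexed table of function pointers (the actual tables are filled in elsewhere in the ARM port, e.g. arm_celt_map.c); without RTCD but with a presumed instruction set, they collapse to a direct call. A self-contained sketch of the table-dispatch half of that pattern, with hypothetical stand-ins for OPUS_ARCHMASK and the IMPL tables:

/* Sketch only: run-time dispatch through an arch-indexed table; all names
 * below are hypothetical stand-ins, not the opus symbols. */
#include <stdio.h>

#define MY_ARCHMASK 3   /* stand-in for OPUS_ARCHMASK */

static int inner_prod_c(const short *x, const short *y, int N)
{
    int i, s = 0;
    for (i = 0; i < N; i++)
        s += x[i] * y[i];
    return s;
}

/* A NEON-capable build would point some slots at the NEON routine instead. */
static int (*const INNER_PROD_IMPL[MY_ARCHMASK + 1])(const short *, const short *, int) =
{ inner_prod_c, inner_prod_c, inner_prod_c, inner_prod_c };

#define inner_prod(x, y, N, arch) ((*INNER_PROD_IMPL[(arch) & MY_ARCHMASK])(x, y, N))

int main(void)
{
    short x[4] = {1, 2, 3, 4}, y[4] = {1, 1, 1, 1};
    int arch = 0;                                 /* normally from CPU detection */
    printf("%d\n", inner_prod(x, y, 4, arch));    /* prints 10 */
    return 0;
}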
290
Src/external_dependencies/openmpt-trunk/include/opus/celt/arm/pitch_neon_intr.c
vendored
Normal file
@ -0,0 +1,290 @@
/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "pitch.h"

#ifdef FIXED_POINT

opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    int16x8_t x_s16x8, y_s16x8;
    int32x4_t xy_s32x4 = vdupq_n_s32(0);
    int64x2_t xy_s64x2;
    int64x1_t xy_s64x1;

    for (i = 0; i < N - 7; i += 8) {
        x_s16x8  = vld1q_s16(&x[i]);
        y_s16x8  = vld1q_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
    }

    if (N - i >= 4) {
        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
        i += 4;
    }

    xy_s64x2 = vpaddlq_s32(xy_s32x4);
    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);

    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}

void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
    int64x2_t xy01_s64x2, xy02_s64x2;
    int64x1_t xy01_s64x1, xy02_s64x1;

    for (i = 0; i < N - 7; i += 8) {
        x_s16x8    = vld1q_s16(&x[i]);
        y01_s16x8  = vld1q_s16(&y01[i]);
        y02_s16x8  = vld1q_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
    }

    if (N - i >= 4) {
        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
        i += 4;
    }

    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);

    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}

#else /* !FIXED_POINT */

/* ========================================================================== */

#ifdef OPUS_CHECK_ASM

/* This part of code simulates floating-point NEON operations. */

/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of celt_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
        xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
        xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
        xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
        xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
    }
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }
    return xy;
}

/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of dual_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
    for (i = 0; i < N - 3; i += 4) {
        xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
        xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
        xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
        xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
        xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
        xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
        xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
        xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
    }
    xy01_0 += xy01_2;
    xy02_0 += xy02_2;
    xy01_1 += xy01_3;
    xy02_1 += xy02_3;
    xy01 = xy01_0 + xy01_1;
    xy02 = xy02_0 + xy02_1;
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;
}

#endif /* OPUS_CHECK_ASM */

/* ========================================================================== */

opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    float32x4_t xy_f32x4 = vdupq_n_f32(0);
    float32x2_t xy_f32x2;

    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y_f32x4;
        x_f32x4  = vld1q_f32(&x[i]);
        y_f32x4  = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        x_f32x4  = vld1q_f32(&x[i + 4]);
        y_f32x4  = vld1q_f32(&y[i + 4]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
    }

    if (N - i >= 4) {
        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        i += 4;
    }

    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
    xy       = vget_lane_f32(xy_f32x2, 0);

    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
#endif

    return xy;
}

void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
    float32x2_t xy01_f32x2, xy02_f32x2;

    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
        x_f32x4    = vld1q_f32(&x[i]);
        y01_f32x4  = vld1q_f32(&y01[i]);
        y02_f32x4  = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        x_f32x4    = vld1q_f32(&x[i + 4]);
        y01_f32x4  = vld1q_f32(&y01[i + 4]);
        y02_f32x4  = vld1q_f32(&y02[i + 4]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
    }

    if (N - i >= 4) {
        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        i += 4;
    }

    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
    xy01       = vget_lane_f32(xy01_f32x2, 0);
    xy02       = vget_lane_f32(xy02_f32x2, 0);

    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
    }
#endif
}

#endif /* FIXED_POINT */
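Both the fixed-point and float kernels above are vectorized forms of the same scalar dot product: accumulate eight lanes per iteration, fold the partial sums with a horizontal reduction (vpaddlq_s32/vadd_s64 in the integer path, vadd_f32/vpadd_f32 in the float path), then finish any tail samples one at a time; under OPUS_CHECK_ASM the result is compared against the scalar reference. A minimal sketch of that scalar loop, written with plain C types (the real reference, celt_inner_prod_c(), lives in the generic pitch code and is built on MAC16_16()):

/* Sketch only: scalar reference for the NEON inner-product kernels above. */
static float inner_prod_ref(const float *x, const float *y, int N)
{
    int i;
    float xy = 0;
    for (i = 0; i < N; i++)
        xy += x[i] * y[i];   /* MAC16_16(xy, x[i], y[i]) in opus terms */
    return xy;
}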