diff --git a/lib/rbcodec/codecs/libopus/celt/celt.c b/lib/rbcodec/codecs/libopus/celt/celt.c
index d91b868..df8a10b 100644
--- a/lib/rbcodec/codecs/libopus/celt/celt.c
+++ b/lib/rbcodec/codecs/libopus/celt/celt.c
@@ -415,6 +415,7 @@ static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS
}
#endif

+//AB static opus_val32 s_x[1080] IBSS_ATTR; /* 4320 byte */
/** Compute the IMDCT and apply window for all sub-frames and
all channels in a frame */
static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
@@ -425,6 +426,7 @@ static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X
const int N = mode->shortMdctSize<<LM;
const int overlap = OVERLAP(mode);
VARDECL(opus_val32, x);
+//AB opus_val32 *x = s_x;
SAVE_STACK;

ALLOC(x, N+overlap, opus_val32);
@@ -448,12 +450,16 @@ static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X
clt_mdct_backward(&mode->mdct, &X[b+c*N2*B], x+N2*b, mode->window, overlap, shortBlocks ? mode->maxLM : mode->maxLM-LM, B);
}

- for (j=0;j<overlap;j++)
- out_mem[c][j] = x[j] + overlap_mem[c][j];
- for (;j<N;j++)
- out_mem[c][j] = x[j];
- for (j=0;j<overlap;j++)
- overlap_mem[c][j] = x[N+j];
+ /* overlap is a multiple of 4, so the loop can be unrolled by 4 */
+ for (j=0;j<overlap;j+=4)
+ {
+ out_mem[c][j ] = x[j ] + overlap_mem[c][j ];
+ out_mem[c][j+1] = x[j+1] + overlap_mem[c][j+1];
+ out_mem[c][j+2] = x[j+2] + overlap_mem[c][j+2];
+ out_mem[c][j+3] = x[j+3] + overlap_mem[c][j+3];
+ }
+ memcpy(out_mem[c]+overlap, x+overlap, (N-overlap)*sizeof(celt_sig));
+ memcpy(overlap_mem[c] , x+N , ( overlap)*sizeof(celt_sig));
} while (++c<C);
RESTORE_STACK;
}
@@ -497,43 +503,63 @@ static void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
const opus_val16 *window, int overlap)
{
- int i;
- /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
- opus_val16 g00, g01, g02, g10, g11, g12;
- static const opus_val16 gains[3][3] = {
- {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
- {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
- {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
- g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
- g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
- g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
- g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
- g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
- g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
- for (i=0;i<overlap;i++)
+ /* Multiply-adds are only needed if g0 or g1 is non-zero. In all other cases a simple
+ * copy of vector x to y is sufficient. */
+ if (g0!=0 || g1!=0)
{
- opus_val16 f;
- f = MULT16_16_Q15(window[i],window[i]);
- y[i] = x[i]
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0])
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0-1])
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0+1])
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0-2])
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0+2])
- + MULT16_32_Q15(MULT16_16_Q15(f,g10),x[i-T1])
- + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1-1])
- + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1+1])
- + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1-2])
- + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1+2]);
-
+ int i;
+ opus_val16 g00, g01, g02, g10, g11, g12, idx0, idx1;
+ static const opus_val16 gains[3][3] = {
+ {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
+ {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
+ {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+ g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
+ g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
+ g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
+ g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
+ g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
+ g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+ /* printf("g0 %d g1 %d\n", g0,g1); */
+ idx0 = -T0;
+ idx1 = -T1;
+ for (i=0;i<overlap;i++,idx0++,idx1++)
+ {
+ opus_val16 f0, f1;
+ f1 = MULT16_16_Q15(window[i],window[i]);
+ f0 = Q15ONE - f1;
+ y[i] = x[i]
+ + MULT16_32_Q15(MULT16_16_Q15(f0,g00), x[idx0 ])
+ + MULT16_32_Q15(MULT16_16_Q15(f0,g01),(x[idx0-1]+x[idx0+1]))
+ + MULT16_32_Q15(MULT16_16_Q15(f0,g02),(x[idx0-2]+x[idx0+2]))
+ + MULT16_32_Q15(MULT16_16_Q15(f1,g10), x[idx1 ])
+ + MULT16_32_Q15(MULT16_16_Q15(f1,g11),(x[idx1-1]+x[idx1+1]))
+ + MULT16_32_Q15(MULT16_16_Q15(f1,g12),(x[idx1-2]+x[idx1+2]));
+ }
+ /* No multiply-add is required if g1 == 0, as the gains g10, g11 and g12 are then all 0. */
+ if (g1!=0)
+ {
+ idx1 = overlap-T1;
+ for (i=overlap;i<N;i++,idx1++)
+ {
+ y[i] = x[i]
+ + MULT16_32_Q15(g10, x[idx1 ])
+ + MULT16_32_Q15(g11,(x[idx1-1]+x[idx1+1]))
+ + MULT16_32_Q15(g12,(x[idx1-2]+x[idx1+2]));
+ }
+ }
+ /* Only perform the vector copy if source and destination are not the same. */
+ else if (x != y)
+ {
+ /* Copy part of vector from x[overlap..N] to y[overlap..N] */
+ memcpy(y+overlap, x+overlap, (N-overlap)*sizeof(opus_val32));
+ }
+ }
+ /* Only perform the vector copy if source and destination are not the same. */
+ else if (x != y)
+ {
+ /* Copy full vector from x[0..N] to y[0..N] */
+ memcpy(y, x, (N)*sizeof(opus_val32));
}
- for (i=overlap;i<N;i++)
- y[i] = x[i]
- + MULT16_32_Q15(g10,x[i-T1])
- + MULT16_32_Q15(g11,x[i-T1-1])
- + MULT16_32_Q15(g11,x[i-T1+1])
- + MULT16_32_Q15(g12,x[i-T1-2])
- + MULT16_32_Q15(g12,x[i-T1+2]);
}

static const signed char tf_select_table[4][8] = {
@@ -2296,14 +2322,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
RESTORE_STACK;
}

+static celt_sig freq[1920] IBSS_ATTR; /* 7680 byte */
+static celt_norm X[1920] IBSS_ATTR; /* 3840 byte */
int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec)
{
int c, i, N;
int spread_decision;
opus_int32 bits;
ec_dec _dec;
- VARDECL(celt_sig, freq);
- VARDECL(celt_norm, X);
+//AB VARDECL(celt_sig, freq);
+//AB VARDECL(celt_norm, X);
VARDECL(celt_ener, bandE);
VARDECL(int, fine_quant);
VARDECL(int, pulses);
@@ -2398,8 +2426,9 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
if (effEnd > st->mode->effEBands)
effEnd = st->mode->effEBands;

- ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
- ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */
+//AB ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
+//AB ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */
+
ALLOC(bandE, st->mode->nbEBands*C, celt_ener);
c=0; do
for (i=0;i<M*st->mode->eBands[st->start];i++)
diff --git a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
index bc6820f..7e4f707 100644
--- a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
+++ b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
@@ -55,9 +55,9 @@ static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
}

#elif defined(CPU_ARM)
-static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
+static inline opus_val32 MULT16_32_Q15(opus_val32 a, opus_val32 b)
{
- int32_t lo, hi;
+ opus_val32 lo, hi;
asm volatile("smull %[lo], %[hi], %[a], %[b] \n\t"
"mov %[lo], %[lo], lsr #15 \n\t"
"orr %[hi], %[lo], %[hi], lsl #17 \n\t"
@@ -71,8 +71,21 @@ static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
#endif

+#if defined(CPU_ARM)
+static inline opus_val32 MULT32_32_Q31(opus_val32 a, opus_val32 b)
+{
+ opus_val32 lo, hi;
+ asm volatile("smull %[lo], %[hi], %[a], %[b] \n\t"
+ "mov %[lo], %[lo], lsr #31 \n\t"
+ "orr %[hi], %[lo], %[hi], lsl #1 \n\t"
+ : [lo] "=&r" (lo), [hi] "=&r" (hi)
+ : [a] "r" (a), [b] "r" (b) );
+ return(hi);
+}
+#else
/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
-#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
+#define MULT32_32_Q31(a,b) (opus_val32)((((int64_t)(a)) * ((int64_t)(b)))>>31)
+#endif

/** Compile-time conversion of float constant to 16-bit value */
#define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
diff --git a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
index 01049d5..15dd76a 100644
--- a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
+++ b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
@@ -398,13 +398,12 @@ static void ki_bfly5(
int i, u;
kiss_fft_cpx scratch[13];
const kiss_twiddle_cpx * twiddles = st->twiddles;
- const kiss_twiddle_cpx *tw;
+ const kiss_twiddle_cpx *tw1,*tw2, *tw3, *tw4;
kiss_twiddle_cpx ya,yb;
kiss_fft_cpx * Fout_beg = Fout;

ya = twiddles[fstride*m];
yb = twiddles[fstride*2*m];
- tw=st->twiddles;

for (i=0;i<N;i++)
{
@@ -414,14 +413,15 @@ static void ki_bfly5(
Fout2=Fout0+2*m;
Fout3=Fout0+3*m;
Fout4=Fout0+4*m;
-
- for ( u=0; u<m; ++u ) {
+ tw1 = tw2 = tw3 = tw4 = st->twiddles;
+
+ for ( u=0; u<m; ++u) {
scratch[0] = *Fout0;
-
- C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
- C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
- C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
- C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
+
+ C_MULC(scratch[1] ,*Fout1, *tw1);
+ C_MULC(scratch[2] ,*Fout2, *tw2);
+ C_MULC(scratch[3] ,*Fout3, *tw3);
+ C_MULC(scratch[4] ,*Fout4, *tw4);

C_ADD( scratch[7],scratch[1],scratch[4]);
C_SUB( scratch[10],scratch[1],scratch[4]);
@@ -449,6 +449,10 @@ static void ki_bfly5(
C_SUB(*Fout3,scratch[11],scratch[12]);

++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+ tw1 += fstride;
+ tw2 += 2*fstride;
+ tw3 += 3*fstride;
+ tw4 += 4*fstride;
}
}
}
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
index 15d2393..b3e0d28 100644
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -208,6 +208,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
}
#endif

+//AB static kiss_fft_scalar f[1920>>1] IBSS_ATTR;
+static kiss_fft_scalar f2[1920>>1] IBSS_ATTR;
void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
{
@@ -215,7 +217,7 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
int N, N2, N4;
kiss_twiddle_scalar sine;
/* VARDECL(kiss_fft_scalar, f); */
- VARDECL(kiss_fft_scalar, f2);
+//AB VARDECL(kiss_fft_scalar, f2);
SAVE_STACK;
N = l->n; /* static modes => N = 1920 */
N >>= shift;
@@ -223,7 +225,7 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
N4 = N>>2;
/* ALLOC(f, N2, kiss_fft_scalar); */
kiss_fft_scalar f[N2]; /* worst case 3840b */
- ALLOC(f2, N2, kiss_fft_scalar);
+//AB ALLOC(f2, N2, kiss_fft_scalar);
/* sin(x) ~= x here */
#ifdef FIXED_POINT
sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
@@ -241,8 +243,11 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
for(i=0;i<N4;i++)
{
kiss_fft_scalar yr, yi;
- yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
- yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
+ int idx0, idx1;
+ idx0 = i<<shift;
+ idx1 = (N4-i)<<shift;
+ yr = -S_MUL(*xp2, t[idx0]) + S_MUL(*xp1,t[idx1]);
+ yi = -S_MUL(*xp2, t[idx1]) - S_MUL(*xp1,t[idx0]);
/* works because the cos is nearly one */
*yp++ = yr - S_MUL(yi,sine);
*yp++ = yi + S_MUL(yr,sine);
@@ -262,11 +267,14 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
for(i=0;i<N4;i++)
{
kiss_fft_scalar re, im, yr, yi;
+ int idx0, idx1;
+ idx0 = i<<shift;
+ idx1 = (N4-i)<<shift;
re = fp[0];
im = fp[1];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
- yr = S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
- yi = S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
+ yr = S_MUL(re,t[idx0]) - S_MUL(im,t[idx1]);
+ yi = S_MUL(im,t[idx0]) + S_MUL(re,t[idx1]);
/* works because the cos is nearly one */
*fp++ = yr - S_MUL(yi,sine);
*fp++ = yi + S_MUL(yr,sine);
diff --git a/lib/rbcodec/codecs/libopus/opus_decoder.c b/lib/rbcodec/codecs/libopus/opus_decoder.c
index 7103b18..d7e3a66 100644
--- a/lib/rbcodec/codecs/libopus/opus_decoder.c
+++ b/lib/rbcodec/codecs/libopus/opus_decoder.c
@@ -131,6 +131,7 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels)
return OPUS_OK;
}

+static char sDec[26468] IBSS_ATTR;
OpusDecoder *opus_decoder_create(opus_int32 Fs, int channels, int *error)
{
int ret;
@@ -142,7 +143,8 @@ OpusDecoder *opus_decoder_create(opus_int32 Fs, int channels, int *error)
*error = OPUS_BAD_ARG;
return NULL;
}
- st = (OpusDecoder *)opus_alloc(opus_decoder_get_size(channels));
+ st = (OpusDecoder *)sDec;
+//AB st = (OpusDecoder *)opus_alloc(opus_decoder_get_size(channels));
if (st == NULL)
{
if (error)