1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
diff --git a/lib/rbcodec/codecs/libopus/celt/celt.c b/lib/rbcodec/codecs/libopus/celt/celt.c
index d91b868..df8a10b 100644
--- a/lib/rbcodec/codecs/libopus/celt/celt.c
+++ b/lib/rbcodec/codecs/libopus/celt/celt.c
@@ -415,6 +415,7 @@ static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS
 }
 #endif
 
+//AB static opus_val32 s_x[1080] IBSS_ATTR; /* 4320 byte */
 /** Compute the IMDCT and apply window for all sub-frames and
     all channels in a frame */
 static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
@@ -425,6 +426,7 @@ static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X
    const int N = mode->shortMdctSize<<LM;
    const int overlap = OVERLAP(mode);
    VARDECL(opus_val32, x);
+//AB    opus_val32 *x = s_x;
    SAVE_STACK;
 
    ALLOC(x, N+overlap, opus_val32);
@@ -448,12 +450,16 @@ static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X
          clt_mdct_backward(&mode->mdct, &X[b+c*N2*B], x+N2*b, mode->window, overlap, shortBlocks ? mode->maxLM : mode->maxLM-LM, B);
       }
 
-      for (j=0;j<overlap;j++)
-         out_mem[c][j] = x[j] + overlap_mem[c][j];
-      for (;j<N;j++)
-         out_mem[c][j] = x[j];
-      for (j=0;j<overlap;j++)
-         overlap_mem[c][j] = x[N+j];
+    /* overlap is a multiple of 4, so the loop is unrolled by 4 */
+      for (j=0;j<overlap;j+=4)
+    {
+    out_mem[c][j  ] = x[j  ] + overlap_mem[c][j  ];
+    out_mem[c][j+1] = x[j+1] + overlap_mem[c][j+1];
+    out_mem[c][j+2] = x[j+2] + overlap_mem[c][j+2];
+    out_mem[c][j+3] = x[j+3] + overlap_mem[c][j+3];
+    }
+    memcpy(out_mem[c]+overlap, x+overlap, (N-overlap)*sizeof(celt_sig));
+    memcpy(overlap_mem[c]    , x+N      , (  overlap)*sizeof(celt_sig));
    } while (++c<C);
    RESTORE_STACK;
 }
@@ -497,43 +503,63 @@ static void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
       const opus_val16 *window, int overlap)
 {
-   int i;
-   /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
-   opus_val16 g00, g01, g02, g10, g11, g12;
-   static const opus_val16 gains[3][3] = {
-         {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
-         {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
-         {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
-   g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
-   g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
-   g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
-   g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
-   g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
-   g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
-   for (i=0;i<overlap;i++)
+   /* Multiply-adds are only needed if g0 or g1 is non-zero. In all other cases a simple
+    * copy of vector x to y is possible. */
+   if (g0!=0 || g1!=0)
    {
-      opus_val16 f;
-      f = MULT16_16_Q15(window[i],window[i]);
-      y[i] = x[i]
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0-1])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0+1])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0-2])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0+2])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g10),x[i-T1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1-1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1+1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1-2])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1+2]);
-
+     int i;
+     opus_val16 g00, g01, g02, g10, g11, g12, idx0, idx1;
+     static const opus_val16 gains[3][3] = {
+       {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
+       {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
+       {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+     g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
+     g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
+     g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
+     g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
+     g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
+     g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+     /* printf("g0 %d g1 %d\n", g0,g1); */
+     idx0 = -T0;
+     idx1 = -T1;
+     for (i=0;i<overlap;i++,idx0++,idx1++)
+     {
+      opus_val16 f0, f1;
+      f1 = MULT16_16_Q15(window[i],window[i]);
+      f0 = Q15ONE - f1;
+      y[i] = x[i]
+           + MULT16_32_Q15(MULT16_16_Q15(f0,g00), x[idx0  ])
+           + MULT16_32_Q15(MULT16_16_Q15(f0,g01),(x[idx0-1]+x[idx0+1]))
+           + MULT16_32_Q15(MULT16_16_Q15(f0,g02),(x[idx0-2]+x[idx0+2]))
+           + MULT16_32_Q15(MULT16_16_Q15(f1,g10), x[idx1  ])
+           + MULT16_32_Q15(MULT16_16_Q15(f1,g11),(x[idx1-1]+x[idx1+1]))
+           + MULT16_32_Q15(MULT16_16_Q15(f1,g12),(x[idx1-2]+x[idx1+2]));
+     }
+     /* No multiply-add required if g1=0 as all multiplicands are 0. */
+     if (g1!=0)
+     {
+       idx1 = overlap-T1;
+       for (i=overlap;i<N;i++,idx1++)
+       {
+        y[i] = x[i]
+             + MULT16_32_Q15(g10, x[idx1  ])
+             + MULT16_32_Q15(g11,(x[idx1-1]+x[idx1+1]))
+             + MULT16_32_Q15(g12,(x[idx1-2]+x[idx1+2]));
+       }
+     }
+     /* Only perform vector copy if source and destination are not same. */
+     else if (x != y)
+     {
+      /* Copy part of vector from x[overlap..N-1] to y[overlap..N-1] */
+      memcpy(y+overlap, x+overlap, (N-overlap)*sizeof(opus_val32));
+     }
+   }
+   /* Only perform vector copy if source and destination are not same. */
+   else if (x != y)
+   {
+    /* Copy full vector from x[0..N-1] to y[0..N-1] */
+    memcpy(y, x, (N)*sizeof(opus_val32));
    }
-   for (i=overlap;i<N;i++)
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x[i-T1])
-               + MULT16_32_Q15(g11,x[i-T1-1])
-               + MULT16_32_Q15(g11,x[i-T1+1])
-               + MULT16_32_Q15(g12,x[i-T1-2])
-               + MULT16_32_Q15(g12,x[i-T1+2]);
 }
 
 static const signed char tf_select_table[4][8] = {
@@ -2296,14 +2322,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
    RESTORE_STACK;
 }
 
+static celt_sig freq[1920] IBSS_ATTR; /* 7680 byte */
+static celt_norm X[1920] IBSS_ATTR; /* 3840 byte */
 int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec)
 {
    int c, i, N;
    int spread_decision;
    opus_int32 bits;
    ec_dec _dec;
-   VARDECL(celt_sig, freq);
-   VARDECL(celt_norm, X);
+//AB   VARDECL(celt_sig, freq);
+//AB   VARDECL(celt_norm, X);
    VARDECL(celt_ener, bandE);
    VARDECL(int, fine_quant);
    VARDECL(int, pulses);
@@ -2398,8 +2426,9 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    if (effEnd > st->mode->effEBands)
       effEnd = st->mode->effEBands;
 
-   ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
-   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+//AB   ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
+//AB   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+
    ALLOC(bandE, st->mode->nbEBands*C, celt_ener);
    c=0; do
       for (i=0;i<M*st->mode->eBands[st->start];i++)
diff --git a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
index bc6820f..7e4f707 100644
--- a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
+++ b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
@@ -55,9 +55,9 @@ static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
 }
 
 #elif defined(CPU_ARM)
-static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
+static inline opus_val32 MULT16_32_Q15(opus_val32 a, opus_val32 b)
 {
-  int32_t lo, hi;
+  opus_val32 lo, hi;
   asm volatile("smull %[lo], %[hi], %[a], %[b] \n\t"
                "mov %[lo], %[lo], lsr #15 \n\t"
                "orr %[hi], %[lo], %[hi], lsl #17 \n\t"
@@ -71,8 +71,21 @@ static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
 #endif
 
+#if defined(CPU_ARM)
+static inline opus_val32 MULT32_32_Q31(opus_val32 a, opus_val32 b)
+{
+  opus_val32 lo, hi;
+  asm volatile("smull %[lo], %[hi], %[a], %[b] \n\t"
+               "mov %[lo], %[lo], lsr #31 \n\t"
+               "orr %[hi], %[lo], %[hi], lsl #1 \n\t"
+               : [lo] "=&r" (lo), [hi] "=&r" (hi)
+               : [a] "r" (a), [b] "r" (b) );
+  return(hi);
+}
+#else
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
-#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
+#define MULT32_32_Q31(a,b) (opus_val32)((((int64_t)(a)) * ((int64_t)(b)))>>31)
+#endif
 
 /** Compile-time conversion of float constant to 16-bit value */
 #define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
diff --git a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
index 01049d5..15dd76a 100644
--- a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
+++ b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
@@ -398,13 +398,12 @@ static void ki_bfly5(
    int i, u;
    kiss_fft_cpx scratch[13];
    const kiss_twiddle_cpx * twiddles = st->twiddles;
-   const kiss_twiddle_cpx *tw;
+   const kiss_twiddle_cpx *tw1,*tw2, *tw3, *tw4;
    kiss_twiddle_cpx ya,yb;
    kiss_fft_cpx * Fout_beg = Fout;
 
    ya = twiddles[fstride*m];
    yb = twiddles[fstride*2*m];
-   tw=st->twiddles;
 
    for (i=0;i<N;i++)
    {
@@ -414,14 +413,15 @@ static void ki_bfly5(
       Fout2=Fout0+2*m;
       Fout3=Fout0+3*m;
       Fout4=Fout0+4*m;
-
-      for ( u=0; u<m; ++u ) {
+    tw1 = tw2 = tw3 = tw4 = st->twiddles;
+    
+      for ( u=0; u<m; ++u) {
          scratch[0] = *Fout0;
-
-         C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
-         C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
-         C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
-         C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
+   
+     C_MULC(scratch[1] ,*Fout1, *tw1);
+     C_MULC(scratch[2] ,*Fout2, *tw2);
+     C_MULC(scratch[3] ,*Fout3, *tw3);
+     C_MULC(scratch[4] ,*Fout4, *tw4);
 
          C_ADD( scratch[7],scratch[1],scratch[4]);
          C_SUB( scratch[10],scratch[1],scratch[4]);
@@ -449,6 +449,10 @@ static void ki_bfly5(
          C_SUB(*Fout3,scratch[11],scratch[12]);
 
          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+     tw1 +=   fstride;
+     tw2 += 2*fstride;
+     tw3 += 3*fstride;
+     tw4 += 4*fstride;
       }
    }
 }
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
index 15d2393..b3e0d28 100644
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -208,6 +208,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
 }
 #endif
 
+//AB static kiss_fft_scalar f[1920>>1] IBSS_ATTR;
+static kiss_fft_scalar f2[1920>>1] IBSS_ATTR;
 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
       const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
 {
@@ -215,7 +217,7 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
    int N, N2, N4;
    kiss_twiddle_scalar sine;
 /*   VARDECL(kiss_fft_scalar, f); */
-   VARDECL(kiss_fft_scalar, f2);
+//AB   VARDECL(kiss_fft_scalar, f2);
    SAVE_STACK;
    N = l->n; /* static modes => N = 1920 */
    N >>= shift;
@@ -223,7 +225,7 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
    N4 = N>>2;
 /*   ALLOC(f, N2, kiss_fft_scalar); */
    kiss_fft_scalar f[N2]; /* worst case 3840b */
-   ALLOC(f2, N2, kiss_fft_scalar);
+//AB   ALLOC(f2, N2, kiss_fft_scalar);
    /* sin(x) ~= x here */
 #ifdef FIXED_POINT
    sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
@@ -241,8 +243,11 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       for(i=0;i<N4;i++)
       {
          kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
+     int idx0, idx1;
+     idx0 = i<<shift;
+     idx1 = (N4-i)<<shift;
+         yr = -S_MUL(*xp2, t[idx0]) + S_MUL(*xp1,t[idx1]);
+         yi = -S_MUL(*xp2, t[idx1]) - S_MUL(*xp1,t[idx0]);
          /* works because the cos is nearly one */
          *yp++ = yr - S_MUL(yi,sine);
          *yp++ = yi + S_MUL(yr,sine);
@@ -262,11 +267,14 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       for(i=0;i<N4;i++)
       {
          kiss_fft_scalar re, im, yr, yi;
+     int idx0, idx1;
+     idx0 = i<<shift;
+     idx1 = (N4-i)<<shift;
          re = fp[0];
          im = fp[1];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
-         yi = S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
+         yr = S_MUL(re,t[idx0]) - S_MUL(im,t[idx1]);
+         yi = S_MUL(im,t[idx0]) + S_MUL(re,t[idx1]);
          /* works because the cos is nearly one */
          *fp++ = yr - S_MUL(yi,sine);
          *fp++ = yi + S_MUL(yr,sine);
diff --git a/lib/rbcodec/codecs/libopus/opus_decoder.c b/lib/rbcodec/codecs/libopus/opus_decoder.c
index 7103b18..d7e3a66 100644
--- a/lib/rbcodec/codecs/libopus/opus_decoder.c
+++ b/lib/rbcodec/codecs/libopus/opus_decoder.c
@@ -131,6 +131,7 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels)
    return OPUS_OK;
 }
 
+static char sDec[26468] IBSS_ATTR;
 OpusDecoder *opus_decoder_create(opus_int32 Fs, int channels, int *error)
 {
    int ret;
@@ -142,7 +143,8 @@ OpusDecoder *opus_decoder_create(opus_int32 Fs, int channels, int *error)
          *error = OPUS_BAD_ARG;
       return NULL;
    }
-   st = (OpusDecoder *)opus_alloc(opus_decoder_get_size(channels));
+   st = (OpusDecoder *)sDec;
+//AB   st = (OpusDecoder *)opus_alloc(opus_decoder_get_size(channels));
    if (st == NULL)
    {
       if (error)