diff --git a/lib/rbcodec/codecs/libopus/celt/celt.c b/lib/rbcodec/codecs/libopus/celt/celt.c
index 74ebee9..f1a1801 100644
--- a/lib/rbcodec/codecs/libopus/celt/celt.c
+++ b/lib/rbcodec/codecs/libopus/celt/celt.c
@@ -448,12 +448,16 @@ static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X
          clt_mdct_backward(&mode->mdct, &X[b+c*N2*B], x+N2*b, mode->window, overlap, shortBlocks ? mode->maxLM : mode->maxLM-LM, B);
       }
 
-      for (j=0;j<overlap;j++)
-         out_mem[c][j] = x[j] + overlap_mem[c][j];
-      for (;j<N;j++)
-         out_mem[c][j] = x[j];
-      for (j=0;j<overlap;j++)
-         overlap_mem[c][j] = x[N+j];
+       /* Unrolled by four: relies on overlap being a multiple of 4 */
+      for (j=0;j<overlap;j+=4)
+    {
+    out_mem[c][j  ] = x[j  ] + overlap_mem[c][j  ];
+    out_mem[c][j+1] = x[j+1] + overlap_mem[c][j+1];
+    out_mem[c][j+2] = x[j+2] + overlap_mem[c][j+2];
+    out_mem[c][j+3] = x[j+3] + overlap_mem[c][j+3];
+    }
+    OPUS_COPY(out_mem[c]+overlap, x+overlap, N-overlap);
+    OPUS_COPY(overlap_mem[c]    , x+N      ,   overlap);
    } while (++c<C);
    RESTORE_STACK;
 }
@@ -497,43 +501,63 @@ static void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
       const opus_val16 *window, int overlap)
 {
-   int i;
-   /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
-   opus_val16 g00, g01, g02, g10, g11, g12;
-   static const opus_val16 gains[3][3] = {
-         {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
-         {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
-         {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
-   g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
-   g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
-   g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
-   g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
-   g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
-   g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
-   for (i=0;i<overlap;i++)
-   {
-      opus_val16 f;
-      f = MULT16_16_Q15(window[i],window[i]);
-      y[i] = x[i]
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0-1])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0+1])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0-2])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0+2])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g10),x[i-T1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1-1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1+1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1-2])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1+2]);
-
-   }
-   for (i=overlap;i<N;i++)
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x[i-T1])
-               + MULT16_32_Q15(g11,x[i-T1-1])
-               + MULT16_32_Q15(g11,x[i-T1+1])
-               + MULT16_32_Q15(g12,x[i-T1-2])
-               + MULT16_32_Q15(g12,x[i-T1+2]);
+    /* Comb filter: y[i] = x[i] + a symmetric 5-tap filter centred on x[i-T].
+     * For i < overlap the (T0,g0,tapset0) filter is cross-faded into the
+     * (T1,g1,tapset1) one with weights 1-window[i]^2 and window[i]^2; for
+     * overlap <= i < N only the second filter is applied. Reads reach back to
+     * x[-T-2]. If both gains are zero, y is just a copy of x. */
+    if (g0!=0 || g1!=0)
+    {
+       int i, idx0, idx1;   /* offsets are int: opus_val16 is a sample type */
+       opus_val16 g00, g01, g02, g10, g11, g12;
+       static const opus_val16 gains[3][3] = {
+             {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
+             {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
+             {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+       g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
+       g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
+       g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
+       g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
+       g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
+       g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+       idx0 = -T0;
+       idx1 = -T1;
+       for (i=0;i<overlap;i++,idx0++,idx1++)
+       {
+          opus_val16 f0, f1;
+          f1 = MULT16_16_Q15(window[i],window[i]);   /* weight of the T1 filter */
+          f0 = Q15ONE - f1;                          /* weight of the T0 filter */
+          y[i] = x[i]
+               + MULT16_32_Q15(MULT16_16_Q15(f0,g00), x[idx0  ])
+               + MULT16_32_Q15(MULT16_16_Q15(f0,g01),(x[idx0-1]+x[idx0+1]))
+               + MULT16_32_Q15(MULT16_16_Q15(f0,g02),(x[idx0-2]+x[idx0+2]))
+               + MULT16_32_Q15(MULT16_16_Q15(f1,g10), x[idx1  ])
+               + MULT16_32_Q15(MULT16_16_Q15(f1,g11),(x[idx1-1]+x[idx1+1]))
+               + MULT16_32_Q15(MULT16_16_Q15(f1,g12),(x[idx1-2]+x[idx1+2]));
+       }
+       /* No multiply-add required if g1=0 as all multiplicands are zero. */
+       if (g1!=0)
+       {
+          idx1 = overlap-T1;
+          for (i=overlap;i<N;i++,idx1++)
+          {
+             y[i] = x[i]
+                  + MULT16_32_Q15(g10, x[idx1  ])
+                  + MULT16_32_Q15(g11,(x[idx1-1]+x[idx1+1]))
+                  + MULT16_32_Q15(g12,(x[idx1-2]+x[idx1+2]));
+          }
+       }
+       /* g1 == 0: the tail x[overlap..N) is unfiltered; copy it unless in place. */
+       else if (x != y)
+       {
+          OPUS_COPY(y+overlap, x+overlap, N-overlap);
+       }
+    }
+    /* Both gains zero: plain copy of x[0..N), skipped when filtering in place. */
+    else if (x != y)
+    {
+       OPUS_COPY(y, x, N);
+    }
 }
 
 static const signed char tf_select_table[4][8] = {
diff --git a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
index 5682a67..0de6760 100644
--- a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
+++ b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
@@ -71,8 +71,21 @@ static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
 #endif
 
+#if defined(CPU_ARM)
+/** 32x32 multiplication, followed by a 31-bit shift right (ARM smull version) */
+static inline int32_t MULT32_32_Q31(int32_t a, int32_t b)
+{
+  int32_t lo, hi;   /* asm below is side-effect free, hence not volatile */
+  asm("smull %[lo], %[hi], %[a], %[b] \n\t"       /* hi:lo = 64-bit signed a*b */
+      "mov %[lo], %[lo], lsr #31 \n\t"             /* lo = bit 31 of the low word */
+      "orr %[hi], %[lo], %[hi], lsl #1 \n\t"       /* hi = (hi << 1) | lo */
+      : [lo] "=&r" (lo), [hi] "=&r" (hi)
+      : [a] "r" (a), [b] "r" (b) );
+  return(hi);
+}
+#else
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
-#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
+#define MULT32_32_Q31(a,b) (opus_val32)((((int64_t)(a)) * ((int64_t)(b)))>>31)
+#endif
 
 /** Compile-time conversion of float constant to 16-bit value */
 #define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
diff --git a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
index 01049d5..5730639 100644
--- a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
+++ b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
@@ -398,13 +398,12 @@ static void ki_bfly5(
    int i, u;
    kiss_fft_cpx scratch[13];
    const kiss_twiddle_cpx * twiddles = st->twiddles;
-   const kiss_twiddle_cpx *tw;
+   const kiss_twiddle_cpx *tw1,*tw2, *tw3, *tw4;
    kiss_twiddle_cpx ya,yb;
    kiss_fft_cpx * Fout_beg = Fout;
 
    ya = twiddles[fstride*m];
    yb = twiddles[fstride*2*m];
-   tw=st->twiddles;
 
    for (i=0;i<N;i++)
    {
@@ -415,13 +414,15 @@ static void ki_bfly5(
       Fout3=Fout0+3*m;
       Fout4=Fout0+4*m;
 
-      for ( u=0; u<m; ++u ) {
+    tw1 = tw2 = tw3 = tw4 = st->twiddles;
+    
+      for ( u=0; u<m; ++u) {
          scratch[0] = *Fout0;
 
-         C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
-         C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
-         C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
-         C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
+     C_MULC(scratch[1] ,*Fout1, *tw1);
+     C_MULC(scratch[2] ,*Fout2, *tw2);
+     C_MULC(scratch[3] ,*Fout3, *tw3);
+     C_MULC(scratch[4] ,*Fout4, *tw4);
 
          C_ADD( scratch[7],scratch[1],scratch[4]);
          C_SUB( scratch[10],scratch[1],scratch[4]);
@@ -449,6 +450,10 @@ static void ki_bfly5(
          C_SUB(*Fout3,scratch[11],scratch[12]);
 
          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+     tw1 +=   fstride;
+     tw2 += 2*fstride;
+     tw3 += 3*fstride;
+     tw4 += 4*fstride;
       }
    }
 }
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
index 8fc1b54..54f9f8d 100644
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -252,9 +252,11 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       const kiss_twiddle_scalar *t = &l->trig[0];
       for(i=0;i<N4;i++)
       {
-         kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
+         kiss_fft_scalar yr, yi, t0, t1;
+         t0 = t[i<<shift];
+         t1 = t[(N4-i)<<shift];
+         yr = -S_MUL(*xp2, t0) + S_MUL(*xp1,t1);
+         yi = -S_MUL(*xp2, t1) - S_MUL(*xp1,t0);
          /* works because the cos is nearly one */
          *yp++ = yr - S_MUL(yi,sine);
          *yp++ = yi + S_MUL(yr,sine);
@@ -273,12 +275,14 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
 
       for(i=0;i<N4;i++)
       {
-         kiss_fft_scalar re, im, yr, yi;
+         kiss_fft_scalar re, im, yr, yi, t0, t1;
+         t0 = t[i<<shift];
+         t1 = t[(N4-i)<<shift];
          re = fp[0];
          im = fp[1];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
-         yi = S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
+         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yi = S_MUL(im,t0) + S_MUL(re,t1);
          /* works because the cos is nearly one */
          *fp++ = yr - S_MUL(yi,sine);
          *fp++ = yi + S_MUL(yr,sine);
@@ -305,12 +309,11 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       kiss_fft_scalar * OPUS_RESTRICT yp1 = out+N4-overlap/2;
       const opus_val16 * OPUS_RESTRICT wp1 = window;
       const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
-      for(i = 0; i< N4-overlap/2; i++)
-      {
-         *xp1 = *fp1;
-         xp1--;
-         fp1--;
-      }
+
+    i = N4-overlap/2;
+    xp1 -= N4-overlap/2;
+    fp1 -= N4-overlap/2;
+    OPUS_COPY(xp1+1, fp1+1, N4-overlap/2);
       for(; i < N4; i++)
       {
          kiss_fft_scalar x1;
@@ -327,12 +330,11 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       kiss_fft_scalar * OPUS_RESTRICT yp2 = out+N-1-(N4-overlap/2);
       const opus_val16 * OPUS_RESTRICT wp1 = window;
       const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
-      for(i = 0; i< N4-overlap/2; i++)
-      {
-         *xp2 = *fp2;
-         xp2++;
-         fp2++;
-      }
+
+    i = N4-overlap/2;
+    OPUS_COPY(xp2, fp2, N4-overlap/2);
+    xp2 += N4-overlap/2;
+    fp2 += N4-overlap/2;
       for(; i < N4; i++)
       {
          kiss_fft_scalar x2;