1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
diff --git a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
index b1fe8fb..f387b40 100644
--- a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
+++ b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
@@ -57,6 +57,23 @@
 
 #   define S_MUL(a,b) MULT16_32_Q15(b, a)
 
+/* S_MULADD(a,b,c,d): multiply-accumulate counterpart of S_MUL(a,b) + S_MUL(c,d). */
+#if defined(CPU_ARM)
+static inline int32_t S_MULADD(int32_t a, int32_t b, int32_t c, int32_t d)
+{
+  int32_t lo, hi;   /* low / high word of the 64-bit accumulator */
+  asm volatile("smull %[lo], %[hi], %[a], %[b]     \n\t" /* hi:lo  = a*b */
+               "smlal %[lo], %[hi], %[c], %[d]     \n\t" /* hi:lo += c*d */
+               "mov %[lo], %[lo], lsr #15          \n\t" /* lo = lo >> 15 (logical) */
+               "orr %[hi], %[lo], %[hi], lsl #17   \n\t" /* hi = bits 15..46 of the sum */
+               : [lo] "=&r" (lo), [hi] "=&r" (hi)        /* early-clobber: never share a register with an input */
+               : [a] "r" (a), [b] "r" (b), [c] "r" (c), [d] "r" (d) );
+  return(hi);
+}
+#else   /* portable fallback: adds two separately computed S_MUL() results */
+#   define S_MULADD(a,b,c,d) ADD32(S_MUL((a),(b)) , S_MUL((c),(d)))
+#endif
+
 #   define C_MUL(m,a,b) \
       do{ (m).r = SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
           (m).i = ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0)
@@ -132,11 +149,12 @@
 #define C_ADDTO( res , a)\
     do {(res).r = ADD32((res).r, (a).r);  (res).i = ADD32((res).i,(a).i);\
     }while(0)
 
+/* C_SUBFROM(res, a): res -= a, applied to both the real and imaginary parts. */
 #define C_SUBFROM( res , a)\
-    do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
+    do {(res).r = SUB32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
     }while(0)
 
 #else  /* not FIXED_POINT*/
 
 #   define S_MUL(a,b) ( (a)*(b) )
diff --git a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
index 01049d5..a0d7ab9 100644
--- a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
+++ b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
@@ -396,7 +396,7 @@ static void ki_bfly5(
 {
    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
    int i, u;
-   kiss_fft_cpx scratch[13];
+   kiss_fft_cpx scratch[12];
    const kiss_twiddle_cpx * twiddles = st->twiddles;
    const kiss_twiddle_cpx *tw;
    kiss_twiddle_cpx ya,yb;
@@ -416,37 +416,33 @@ static void ki_bfly5(
       Fout4=Fout0+4*m;
 
       for ( u=0; u<m; ++u ) {
-         scratch[0] = *Fout0;
-
-         C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
+         C_MULC(scratch[1] ,*Fout1, tw[  u*fstride]);
          C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
          C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
          C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
 
-         C_ADD( scratch[7],scratch[1],scratch[4]);
+         C_ADD( scratch[ 7],scratch[1],scratch[4]);
          C_SUB( scratch[10],scratch[1],scratch[4]);
-         C_ADD( scratch[8],scratch[2],scratch[3]);
-         C_SUB( scratch[9],scratch[2],scratch[3]);
-
+         C_ADD( scratch[ 8],scratch[2],scratch[3]);
+         C_SUB( scratch[ 9],scratch[2],scratch[3]);
+
+         scratch[ 5].r = Fout0->r + S_MULADD(scratch[7].r,ya.r,scratch[8].r,yb.r);
+         scratch[ 5].i = Fout0->i + S_MULADD(scratch[7].i,ya.r,scratch[8].i,yb.r);
+         scratch[11].r = Fout0->r + S_MULADD(scratch[7].r,yb.r,scratch[8].r,ya.r);
+         scratch[11].i = Fout0->i + S_MULADD(scratch[7].i,yb.r,scratch[8].i,ya.r);
+         /* scratch[5] and scratch[11] read *Fout0 above, so it is only updated now. */
          Fout0->r += scratch[7].r + scratch[8].r;
          Fout0->i += scratch[7].i + scratch[8].i;
 
-         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
-         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
-
-         scratch[6].r = -S_MUL(scratch[10].i,ya.i) - S_MUL(scratch[9].i,yb.i);
-         scratch[6].i =  S_MUL(scratch[10].r,ya.i) + S_MUL(scratch[9].r,yb.i);
-
-         C_SUB(*Fout1,scratch[5],scratch[6]);
-         C_ADD(*Fout4,scratch[5],scratch[6]);
-
-         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
-         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
-         scratch[12].r =  S_MUL(scratch[10].i,yb.i) - S_MUL(scratch[9].i,ya.i);
-         scratch[12].i = -S_MUL(scratch[10].r,yb.i) + S_MUL(scratch[9].r,ya.i);
-
-         C_ADD(*Fout2,scratch[11],scratch[12]);
-         C_SUB(*Fout3,scratch[11],scratch[12]);
+         scratch[6].r = -S_MULADD( scratch[10].i, ya.i, scratch[9].i, yb.i);
+         scratch[6].i =  S_MULADD( scratch[10].r, ya.i, scratch[9].r, yb.i);
+         scratch[0].r =  S_MULADD( scratch[10].i, yb.i,-scratch[9].i, ya.i);
+         scratch[0].i =  S_MULADD(-scratch[10].r, yb.i, scratch[9].r, ya.i);
+         /* scratch[6] pairs with scratch[5] (Fout4/Fout1), scratch[0] with scratch[11] (Fout2/Fout3). */
+         C_ADD(*Fout4,scratch[ 5],scratch[6]);
+         C_SUB(*Fout1,scratch[ 5],scratch[6]);
+         C_ADD(*Fout2,scratch[11],scratch[0]);
+         C_SUB(*Fout3,scratch[11],scratch[0]);
 
          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
       }