diff --git a/asm_arm.h b/asm_arm.h
index 5ea2bf4..d0e249e 100644
--- a/asm_arm.h
+++ b/asm_arm.h
@@ -17,6 +17,12 @@
 
 #ifdef _ARM_ASSEM_
 
+#ifdef __thumb__ /* THUMB(a) wraps the "it"/"itt" asm string fragments used in this file */
+# define THUMB(a) a /* __thumb__ defined: keep the argument in the asm text */
+#else
+# define THUMB(a) /* __thumb__ not defined: expand to nothing, dropping the argument */
+#endif
+
 /*
  * This should be used as a memory barrier, forcing all cached values in
  * registers to wr writen back to memory.  Might or might not be beneficial
@@ -207,7 +213,12 @@ void vect_mult_bw(ogg_int32_t *data, ogg_int32_t *window, int n)
 {
   while (n>=4) {
     asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
+#ifdef __thumb__ /* LDMDA has no Thumb encoding: emulate "ldmda %[w]!" (reads w-12..w, then w -= 16) */
+                  "sub %[w], %[w], #12;" /* point at the lowest word the ARM ldmda would read */
+                  "ldmia %[w], {r4, r5, r6, r7};" "sub %[w], %[w], #4;" /* same loads; net w -= 16 */
+#else
                   "ldmda %[w]!, {r4, r5, r6, r7};"
+#endif
                   "smull r8, r9, r0, r7;"
                   "mov   r0, r9, lsl #1;"
                   "smull r8, r9, r1, r6;"
@@ -241,9 +252,11 @@ void vect_mult_bw(ogg_int32_t *data, ogg_int32_t *window, int n)
 static inline ogg_int32_t CLIP_TO_15(ogg_int32_t x) {
   int tmp;
   asm volatile("subs	%1, %0, #32768\n\t"
+THUMB(     "itt     pl\n\t")
 	       "movpl	%0, #0x7f00\n\t"
 	       "orrpl	%0, %0, #0xff\n"
 	       "adds	%1, %0, #32768\n\t"
+THUMB(     "it      mi\n\t")
 	       "movmi	%0, #0x8000"
 	       : "+r"(x),"=r"(tmp)
 	       :
@@ -264,18 +277,20 @@ static inline void lsp_loop_asm(ogg_uint32_t *qip,ogg_uint32_t *pip,
   ogg_uint32_t qi=*qip,pi=*pip;
   ogg_int32_t qexp=*qexpp;
 
-  asm("mov     r0,%3;"
+  asm("mov     r12,%3;"
       "movs    r1,%5,asr#1;"
-      "add     r0,r0,r1,lsl#3;"
+      "add     r12,r12,r1,lsl#3;"
       "beq 2f;\n"
       "1:"
       
-      "ldmdb   r0!,{r1,r3};"
+      "ldmdb   r12!,{r1,r3};"
       "subs    r1,r1,%4;"          //ilsp[j]-wi
+THUMB("it      mi;")
       "rsbmi   r1,r1,#0;"          //labs(ilsp[j]-wi)
       "umull   %0,r2,r1,%0;"       //qi*=labs(ilsp[j]-wi)
       
       "subs    r1,r3,%4;"          //ilsp[j+1]-wi
+THUMB("it      mi;")
       "rsbmi   r1,r1,#0;"          //labs(ilsp[j+1]-wi)
       "umull   %1,r3,r1,%1;"       //pi*=labs(ilsp[j+1]-wi)
       
@@ -287,22 +302,23 @@ static inline void lsp_loop_asm(ogg_uint32_t *qip,ogg_uint32_t *pip,
       "mov     %1,%1,lsr #16;"
       "orr     %1,%1,r3,lsl #16;"
       "0:"
-      "cmp     r0,%3;\n"
+      "cmp     r12,%3;\n"
       "bhi     1b;\n"
       
       "2:"
       // odd filter assymetry
-      "ands    r0,%5,#1;\n"
+      "ands    r12,%5,#1;\n"
       "beq     3f;\n"
-      "add     r0,%3,%5,lsl#2;\n"
+      "add     r12,%3,%5,lsl#2;\n"
       
-      "ldr     r1,[r0,#-4];\n"
-      "mov     r0,#0x4000;\n"
+      "ldr     r1,[r12,#-4];\n"
+      "mov     r12,#0x4000;\n"
       
       "subs    r1,r1,%4;\n"          //ilsp[j]-wi
+THUMB("it      mi;")
       "rsbmi   r1,r1,#0;\n"          //labs(ilsp[j]-wi)
       "umull   %0,r2,r1,%0;\n"       //qi*=labs(ilsp[j]-wi)
-      "umull   %1,r3,r0,%1;\n"       //pi*=labs(ilsp[j+1]-wi)
+      "umull   %1,r3,r12,%1;\n"       //pi*=labs(ilsp[j+1]-wi)
       
       "cmn     r2,r3;\n"             // shift down 16?
       "beq     3f;\n"
@@ -323,18 +339,23 @@ static inline void lsp_loop_asm(ogg_uint32_t *qip,ogg_uint32_t *pip,
       "mov     r2,#0;"
       "orr     r1,%0,%1;"
       "tst     r1,#0xff000000;"
+THUMB("itt     ne;")
       "addne   r2,r2,#8;"
       "movne   r1,r1,lsr #8;"
       "tst     r1,#0x00f00000;"
+THUMB("itt     ne;")
       "addne   r2,r2,#4;"
       "movne   r1,r1,lsr #4;"
       "tst     r1,#0x000c0000;"
+THUMB("itt     ne;")
       "addne   r2,r2,#2;"
       "movne   r1,r1,lsr #2;"
       "tst     r1,#0x00020000;"
+THUMB("itt     ne;")
       "addne   r2,r2,#1;"
       "movne   r1,r1,lsr #1;"
       "tst     r1,#0x00010000;"
+THUMB("it      ne;")
       "addne   r2,r2,#1;"
       "mov     %0,%0,lsr r2;"
       "mov     %1,%1,lsr r2;"
@@ -342,7 +363,7 @@ static inline void lsp_loop_asm(ogg_uint32_t *qip,ogg_uint32_t *pip,
       
       : "+r"(qi),"+r"(pi),"+r"(qexp)
       : "r"(ilsp),"r"(wi),"r"(m)
-      : "r0","r1","r2","r3","cc");
+      : "r1","r2","r3","r12","cc");
   
   *qip=qi;
   *pip=pi;
@@ -355,15 +376,19 @@ static inline void lsp_norm_asm(ogg_uint32_t *qip,ogg_int32_t *qexpp){
   ogg_int32_t qexp=*qexpp;
 
   asm("tst     %0,#0x0000ff00;"
+THUMB("itt     eq;")
       "moveq   %0,%0,lsl #8;"
       "subeq   %1,%1,#8;"
       "tst     %0,#0x0000f000;"
+THUMB("itt     eq;")
       "moveq   %0,%0,lsl #4;"
       "subeq   %1,%1,#4;"
       "tst     %0,#0x0000c000;"
+THUMB("itt     eq;")
       "moveq   %0,%0,lsl #2;"
       "subeq   %1,%1,#2;"
       "tst     %0,#0x00008000;"
+THUMB("itt     eq;")
       "moveq   %0,%0,lsl #1;"
       "subeq   %1,%1,#1;"
       : "+r"(qi),"+r"(qexp)