diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
index c4aea14..d5fe6b6 100644
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@@ -85,15 +85,40 @@
.global udiv32_arm
.type udiv32_arm,%function

+.set recip_max, 4096
+
udiv32_arm:
+#ifdef USE_IRAM
+ cmp r1, #recip_max
+ bhi .L_udiv
+ subs r2, r1, #3
+ bcc .L_udiv_tiny
+ adr r3, .L_udiv_recip_table
+ ldr r2, [r3, r2, lsl #2]
+ mov r3, r0
+ umull ip, r0, r2, r0
+ mul r2, r0, r1
+ cmp r3, r2
+ bxcs lr
+ sub r0, r0, #1
+ bx lr
+.L_udiv_tiny:
+ cmp r1, #1
+ movhi r0, r0, lsr #1
+ bxcs lr
+ b .L_div0
+#endif
+.L_udiv:
/* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
and add the next bit of the result. The correction code at .L_udiv32
does not need the divisor inverted, but can be modified to work with it,
and this allows the zero divisor test to be done early and without an
explicit comparison. */
rsbs r1, r1, #0
- beq .L_div0
tst r0, r0
+#ifndef USE_IRAM
+ beq .L_div0
+#endif
/* High bit must be unset, otherwise shift numerator right, calculate,
and correct results. As this case is very uncommon we want to avoid
any other delays on the main path in handling it, so the long divide
@@ -125,10 +150,37 @@ udiv32_arm:
.L_div0:
/* __div0 expects the calling address on the top of the stack */
stmdb sp!, { lr }
+ mov r0, #0
#if defined(__ARM_EABI__) || !defined(USE_IRAM)
bl __div0
#else
- mov lr, pc
- bx r3
+ ldr pc, [pc, #-4]
+ .word __div0
+#endif
+#ifdef USE_IRAM
+.L_udiv_recip_table:
+ .set div, 3
+ .rept recip_max - 2
+ .if (div - 1) & div
+ .set q, 0x40000000 / div
+ .set r, (0x40000000 - (q * div))<<1
+ .set q, q << 1
+ .if r >= div
+ .set q, q + 1
+ .set r, r - div
+ .endif
+ .set r, r << 1
+ .set q, q << 1
+ .if r >= div
+ .set q, q + 1
+ .set r, r - div
+ .endif
+ .set q, q + 1
+ .else
+ .set q, 0x40000000 / div * 4
+ .endif
+ .word q
+ .set div, div+1
+ .endr
#endif
.size udiv32_arm, . - udiv32_arm