#include <config.h>

/* ARM_UMOD32_BODY: unsigned 32-bit modulus by shift-and-subtract long
   division; on exit \numerator holds numerator % divisor.
   \tmp and \bits are scratch.  If \div0label is non-empty, a zero divisor
   branches there.  If \return is nonzero the macro exits with bx lr,
   otherwise it falls through (local label 1: is the exit). */
.macro ARM_UMOD32_BODY numerator, divisor, tmp, bits, div0label, return
.ifnc "", \div0label
tst \divisor, \divisor
beq \div0label
.endif
#if ARM_ARCH >= 5
/* \bits = clz(divisor) - clz(numerator): how far the divisor must be
   shifted left for its top set bit to line up with the numerator's. */
clz \tmp, \numerator
clz \bits, \divisor
subs \bits, \bits, \tmp
/* If the highest set bit of divisor is higher than that of numerator,
   skip straight to end, as numerator == result. */
.ifne \return
bxcc lr
.else
bcc 1f
.endif
/* \bits = 31 - shift: the number of unrolled steps to skip below.
   Also sets Z when nothing needs skipping (shift == 31). */
rsbs \bits, \bits, #31
#else
/* No clz before ARMv5: binary-search the alignment shift instead.
   \tmp accumulates divisor << shift while \bits counts the unrolled
   steps to skip, starting from all 31. */
mov \bits, #31
mov \tmp, \divisor
.set shift, 16
.rept 4
cmp \tmp, \numerator, lsr #shift
movls \tmp, \tmp, lsl #shift
subls \bits, \bits, #shift
.set shift, shift >> 1
.endr
/* Don't need to update the shifted divisor on the last pass. Also use subs
   instead of sub. If cmp sets Z, the subs will be executed and set Z if
   there are no passes to skip. Either way the PC add can be skipped safely
   if Z is set. */
cmp \tmp, \numerator, lsr #1
sublss \bits, \bits, #1
#endif
/* Computed jump into the unrolled loop: each step is two instructions
   (8 bytes), so advance pc by \bits * 8.  pc reads as '. + 8' here, so
   the nop pads the distance; when the add is skipped (Z set) execution
   falls through the nop to the first step, which is equivalent. */
addne pc, pc, \bits, lsl #3
nop
/* One trial subtract per bit position, from bit 31 down to bit 1. */
.set shift, 31
.rept 31
cmp \numerator, \divisor, lsl #shift
subcs \numerator, \numerator, \divisor, lsl #shift
.set shift, shift - 1
.endr
/* Final trial subtract for bit 0; \numerator is now the remainder. */
cmp \numerator, \divisor
subcs \numerator, \numerator, \divisor
.ifne \return
bx lr
.else
1:
.endif
.endm

/* ARM_UDIV32_BODY: unsigned 32-bit divide front end.  The inner macro
   works on a negated copy of the divisor kept in its \idivisor argument.
   On ARMv5+ the negated divisor lives in \tmp and \divisor itself is
   reused as the inner macro's scratch (the clz sequence frees it early);
   on ARMv4 the two roles are swapped.  All other arguments are passed
   through unchanged. */
.macro ARM_UDIV32_BODY numerator, divisor, quotient, remainder, tmp, bits, div0label, invertedlabel, inverteddivisor, return
#if ARM_ARCH >= 5
ARM_UDIV32_BODY_INNER \numerator, \divisor, \quotient, \remainder, \divisor, \bits, \div0label, \invertedlabel, \inverteddivisor, \return, \tmp
#else
ARM_UDIV32_BODY_INNER \numerator, \divisor, \quotient, \remainder, \tmp, \bits, \div0label, \invertedlabel, \inverteddivisor, \return, \divisor
#endif
.endm

/* ARM_UDIV32_BODY_INNER: unsigned 32-bit division core.  Operates on the
   NEGATED divisor (\idivisor receives -divisor) so each step can be done
   with add/adc instead of sub/rsc.  Produces \quotient = num / div and/or
   \remainder = num % div; pass "" for any output not wanted.  \tmp and
   \bits are scratch.  \inverteddivisor nonzero means the caller already
   negated the divisor.  If \return is nonzero, every exit is bx lr;
   otherwise all paths fall through to the local label 99:. */
.macro ARM_UDIV32_BODY_INNER numerator, divisor, quotient, remainder, tmp, bits, div0label, invertedlabel, inverteddivisor, return, idivisor
/* When this is wrapped for signed division, the wrapper code will handle
   inverting the divisor, and also the zero divisor test. */
.ifeq \inverteddivisor
.ifnc "", "\div0label"
/* rsbs both negates the divisor and sets Z on divisor == 0. */
rsbs \idivisor, \divisor, #0
beq \div0label
.else
rsb \idivisor, \divisor, #0
.endif
.else
.ifnc "", "\div0label"
tst \divisor, \divisor
beq \div0label
.endif
.endif
/* If the function using this macro expects to be wrapped for signed
   division, a second entry point can be offered that skips the divisor
   inversion and the zero divisor test. */
.ifnc "", "\invertedlabel"
\invertedlabel:
.endif
/* This SWAR divider requires a numerator less than 1<<31, because it must
   be able to shift the remainder left at each step without shifting out
   the topmost bit. Since a shift might be needed for the aligned remainder
   to exceed the divisor, the topmost bit must be unset at the start to
   avoid this overflow case. The original numerator is saved so that the
   result can be corrected after the reduced division completes. */
#if ARM_ARCH >= 5
clz \bits, \numerator
clz \divisor, \divisor
/* \bits = clz(div) - clz(num); divisor's top bit at or above the
   numerator's means the quotient can only be 0 or 1: shortcut to 20:. */
subs \bits, \divisor, \bits
bls 20f
/* Save the numerator in \tmp.  N reflects its top bit; if set, halve the
   numerator (the dropped bit is repaired at 10: below). */
movs \tmp, \numerator
submi \bits, \bits, #1
movmi \numerator, \numerator, lsr #1
/* Clear \bits if it went negative (asr #32 is all-ones only then). */
bic \bits, \bits, \bits, asr #32
/* Align the negated divisor with the numerator; \bits becomes the count
   of unrolled steps to skip (out of the 30 below). */
mov \idivisor, \idivisor, lsl \bits
rsb \bits, \bits, #30
#else
/* \idivisor is -divisor, so cmn effectively compares num against div;
   num <= div means the quotient is 0 or 1: shortcut to 20: (which
   reuses these flags on ARMv4). */
cmn \numerator, \idivisor
bls 20f
/* Save the numerator; halve it when its top bit is set, as above. */
movs \tmp, \numerator
movmi \numerator, \numerator, lsr #1
mov \bits, #30
/* Binary-search the alignment: shift the negated divisor up while its
   magnitude stays within the numerator, reducing the skip count. */
.set shift, 16
.rept 5
cmn \idivisor, \numerator, lsr #shift
subcs \bits, \bits, #shift
movcs \idivisor, \idivisor, lsl #shift
.set shift, shift >> 1
.endr
#endif
/* First division step against the aligned (negated) divisor. */
adds \numerator, \numerator, \idivisor
subcc \numerator, \numerator, \idivisor
/* Computed jump: skip \bits of the 30 two-instruction (8-byte) steps. */
add pc, pc, \bits, lsl #3
nop
/* Each step shifts \numerator left, folding the previous quotient bit
   into bit 0 via adc, and trial-adds the negated divisor; quotient bits
   accumulate in the low end while the remainder stays in the high end. */
.rept 30
adcs \numerator, \idivisor, \numerator, lsl #1
subcc \numerator, \numerator, \idivisor
.endr
/* Fold in the final quotient bit. */
adc \numerator, \numerator, \numerator
/* Recover the saved numerator: N = its original top bit (i.e. whether it
   was pre-halved), C = its bit 0 (the bit the halving dropped).  C is
   consumed by the adc at 10: below. */
movs \tmp, \tmp, asr #1
/* \bits = bit position of the quotient/remainder split in \numerator. */
rsb \bits, \bits, #31
bmi 10f
/* Numerator was not halved: unpack remainder (high part) and quotient
   (low part, obtained by clearing the remainder bits with eor). */
.ifc "", "\quotient"
mov \remainder, \numerator, lsr \bits
.else
.ifc "", "\remainder"
mov \idivisor, \numerator, lsr \bits
eor \quotient, \numerator, \idivisor, lsl \bits
.else
mov \remainder, \numerator, lsr \bits
eor \quotient, \numerator, \remainder, lsl \bits
.endif
.endif
.ifne \return
bx lr
.else
b 99f
.endif
10:
/* Correction path for a pre-halved numerator: split the packed result,
   double the remainder and re-add the dropped low bit (carried in C),
   then perform one extra division step against the shifted divisor to
   finish the quotient and remainder. */
mov \tmp, \numerator, lsr \bits
eor \numerator, \numerator, \tmp, lsl \bits
sub \bits, \bits, #1
adc \tmp, \tmp, \tmp
adds \tmp, \tmp, \idivisor, asr \bits
.ifnc "", "\quotient"
adc \quotient, \numerator, \numerator
.endif
.ifnc "", "\remainder"
subcc \remainder, \tmp, \idivisor, asr \bits
movcs \remainder, \tmp
.endif
.ifne \return
bx lr
.else
b 99f
.endif
20:
/* The branch is reached on ARMv5+ based on the result of comparing leading
   zero counts, so we still need to compare the actual values. */
#if ARM_ARCH >= 5
.ifnc "", "\remainder"
/* remainder = num - div when num >= div, else num (adds with -div). */
adds \remainder, \numerator, \idivisor
subcc \remainder, \remainder, \idivisor
.else
cmn \numerator, \idivisor
.endif
#else
/* On ARMv4 the flags from the initial cmn are still live here:
   C set means numerator >= divisor. */
.ifnc "", "\remainder"
movcs \remainder, #0
.ifnc "\remainder", "\numerator"
movcc \remainder, \numerator
.endif
.endif
#endif
/* Quotient is 1 when numerator >= divisor, else 0. */
.ifnc "", "\quotient"
movcs \quotient, #1
movcc \quotient, #0
.endif
.ifne \return
bx lr
.else
99:
.endif
.endm

/* ARM_SDIV32_BODY: signed 32-bit divide/modulus wrapped around the
   unsigned core.  Division truncates toward zero: the quotient's sign is
   sign(num) ^ sign(div) and the remainder takes the numerator's sign.
   \sign is scratch used to pack both signs.  Exits via bx lr when
   \return is nonzero. */
.macro ARM_SDIV32_BODY numerator, divisor, quotient, remainder, tmp, sign, bits, div0label, return
/* \sign[31] = divisor sign; Z set when the divisor is non-negative. */
ands \sign, \divisor, #1<<31
/* We can invert the divisor here for ARMv4, and save an instruction if
   there is no div0 handling. */
#if ARM_ARCH >= 5
/* Negate a negative divisor; the unsigned body negates it again itself
   (inverteddivisor=0 below). */
rsbne \divisor, \divisor, #0
#else
/* Negate a non-negative divisor so it is already inverted for the
   unsigned body (inverteddivisor=1 below). */
rsbeq \divisor, \divisor, #0
#endif
/* \sign[31] = result sign, \sign[0:30], C = numerator sign
   (asr #32 yields 0 or ~0, so the eor flips bit 31 only when the
   numerator is negative while filling the lower bits with its sign). */
eors \sign, \sign, \numerator, asr #32
/* Make the numerator non-negative. */
rsbcs \numerator, \numerator, #0
#if ARM_ARCH >= 5
ARM_UDIV32_BODY \numerator, \divisor, \quotient, \remainder, \tmp, \bits, \div0label, "", 0, 0
#else
ARM_UDIV32_BODY \numerator, \divisor, \quotient, \remainder, \tmp, \bits, \div0label, "", 1, 0
#endif
/* C = quotient sign (old bit 31), N = remainder/numerator sign (old
   bit 30). */
movs \sign, \sign, lsl #1
.ifnc "", "\quotient"
rsbcs \quotient, \quotient, #0
.endif
.ifnc "", "\remainder"
rsbmi \remainder, \remainder, #0
.endif
.ifne \return
bx lr
.endif
.endm

#if ARM_ARCH >= 5
/* ARM5_UDIV32_BODY: unsigned 32-bit divide via a reciprocal estimate
   (ARMv5E+, uses clz and the signed multiply-accumulate instructions).
   An 8-bit estimate for the normalized divisor is looked up in
   .L_udiv_est_table, refined by multiply steps, and a umull then yields a
   quotient estimate that is corrected by a small number of divisor
   add/subtracts.  Power-of-two divisors (and zero) take the shortcut at
   20:.  \bits, \inv and \neg are scratch (\neg holds -divisor during the
   fix-up).  Exits via bx lr when \return is nonzero, else falls through
   to 99:. */
.macro ARM5_UDIV32_BODY numerator, divisor, quotient, remainder, bits, inv, neg, div0label, return
/* Normalize: \inv = divisor << clz(divisor) has bit 31 set for any
   non-zero divisor. */
clz \bits, \divisor
mov \inv, \divisor, lsl \bits
/* \inv == 1<<31 means a power-of-two divisor; below it means zero.
   The flags are reused at 20: to tell those cases apart. */
cmp \inv, #1<<31
/* Table address from the top 7 bits of the normalized divisor (these
   fall in 64..127; the -64 in the ldrb offset rebases to 0..63). */
add \inv, pc, \inv, lsr #25
bls 20f
ldrb \inv, [\inv, #.L_udiv_est_table-.-64]
subs \bits, \bits, #7
rsb \neg, \divisor, #0
/* Large divisors (clz < 7) use the down-shifted estimate path at 10:. */
bmi 10f
/* Scale the 8-bit estimate up and refine it; each mul by the negated
   divisor measures the current error of the reciprocal. */
mov \divisor, \inv, lsl \bits
mul \inv, \divisor, \neg
smlawt \divisor, \divisor, \inv, \divisor
mul \inv, \divisor, \neg
#if ARM_ARCH >= 6
smmla \divisor, \divisor, \inv, \divisor
#else
/* smlal pair emulates smmla (accumulate high word) on ARMv5. */
mov \bits, #0
smlal \bits, \divisor, \divisor, \inv
#endif
/* Quotient estimate = high word of numerator * reciprocal. */
umull \bits, \divisor, \numerator, \divisor
/* The estimate may be slightly low: form the trial remainder with mla
   and nudge quotient/remainder by the divisor as needed. */
add \numerator, \numerator, \neg
.ifc "", "\quotient"
mla \remainder, \divisor, \neg, \numerator
cmn \remainder, \neg
subcs \remainder, \neg
addpl \remainder, \remainder, \neg, lsl #1
.else
mla \inv, \divisor, \neg, \numerator
cmn \inv, \neg
mov \quotient, \divisor
addcc \quotient, \quotient, #1
addpl \quotient, \quotient, #2
.ifnc "", "\remainder"
mov \remainder, \inv
subcs \remainder, \neg
addpl \remainder, \remainder, \neg, lsl #1
.endif
.endif
.if \return
bx lr
.else
b 99f
.endif
10:
/* Divisor with fewer than 7 leading zeros: shift the table estimate down
   instead of up (with a small bias so the estimate never overshoots). */
rsb \bits, \bits, #0
sub \inv, \inv, #4
mov \divisor, \inv, lsr \bits
umull \bits, \divisor, \divisor, \numerator
.ifc "", "\quotient"
/* Trial remainder may exceed the divisor; up to two corrective
   subtractions (2*div then div) bring it into range. */
mla \remainder, \divisor, \neg, \numerator
cmn \neg, \remainder, lsr #1
addcs \remainder, \neg, lsl #1
cmn \neg, \remainder
addcs \remainder, \neg
.else
mla \inv, \divisor, \neg, \numerator
mov \quotient, \divisor
cmn \neg, \inv, lsr #1
addcs \quotient, \quotient, #2
addcs \inv, \inv, \neg, lsl #1
cmn \inv, \neg
addcs \quotient, \quotient, #1
.ifnc "", "\remainder"
addcs \remainder, \inv, \neg
movcc \remainder, \inv
.endif
.endif
.if \return
bx lr
.else
b 99f
.endif
20:
/* Flags are still from "cmp \inv, #1<<31": not-equal here can only mean
   the normalized divisor was below 1<<31, i.e. the divisor was zero. */
.ifnc "", "\div0label"
bne \div0label
.endif
/* Power-of-two divisor: mask gives the remainder, shift by
   31 - clz(divisor) (= log2) gives the quotient. */
.ifnc "", "\remainder"
sub \divisor, \divisor, #1
.ifnc "", "\quotient"
rsb \bits, \bits, #31
.endif
and \remainder, \numerator, \divisor
.ifnc "", "\quotient"
mov \quotient, \numerator, lsr \bits
.endif
.else
rsb \bits, \bits, #31
mov \quotient, \numerator, lsr \bits
.endif
.if \return
bx lr
.endif
99:
.endm
#endif

.text
.align
/* Exported division entry points.  As seen from the macro invocations
   below, all take r0 = numerator, r1 = divisor, return the result(s) in
   r0 (and r1 for the divmod variants), and branch to div0_wrap on a zero
   divisor. */
.global udiv32_arm
.type udiv32_arm,%function
.global umod32_arm
.type umod32_arm,%function
.global umod32_arm_2
.type umod32_arm_2,%function
.global udivmod32_arm
.type udivmod32_arm,%function
.global sdiv32_arm
.type sdiv32_arm,%function
.global smod32_arm
.type smod32_arm,%function
.global sdivmod32_arm
.type sdivmod32_arm,%function

/* unsigned divide: r0 = r0 / r1 (r2, r3 clobbered). */
udiv32_arm:
ARM_UDIV32_BODY r0, r1, r0, "", r2, r3, "div0_wrap", "", 0, 1

/* unsigned modulus via the divide core: r0 = r0 % r1 (r2, r3 clobbered). */
umod32_arm:
ARM_UDIV32_BODY r0, r1, "", r0, r2, r3, "div0_wrap", "", 0, 1

/* alternative unsigned modulus using the dedicated remainder-only body:
   r0 = r0 % r1 (r2, r3 clobbered). */
umod32_arm_2:
ARM_UMOD32_BODY r0, r1, r2, r3, "div0_wrap", 1

/* unsigned divide + modulus: r0 = r0 / r1, r1 = r0 % r1
   (r2, r3 clobbered). */
udivmod32_arm:
ARM_UDIV32_BODY r0, r1, r0, r1, r2, r3, "div0_wrap", "", 0, 1

/* signed divide (truncating): r0 = r0 / r1 (r2, r3, ip clobbered). */
sdiv32_arm:
ARM_SDIV32_BODY r0, r1, r0, "", r2, r3, ip, "div0_wrap", 1

/* signed modulus (sign follows the numerator): r0 = r0 % r1
   (r2, r3, ip clobbered). */
smod32_arm:
ARM_SDIV32_BODY r0, r1, "", r0, r2, r3, ip, "div0_wrap", 1

/* signed divide + modulus: r0 = r0 / r1, r1 = r0 % r1
   (r2, r3, ip clobbered). */
sdivmod32_arm:
ARM_SDIV32_BODY r0, r1, r0, r1, r2, r3, ip, "div0_wrap", 1

#if ARM_ARCH >= 5
/* Reciprocal-estimate variants, available on ARMv5+ only.  Same register
   contract as the generic entry points above: r0 = numerator,
   r1 = divisor, result(s) in r0 (and r1 for divmod). */
.global udiv32_armv5
.type udiv32_armv5,%function
.global umod32_armv5
.type umod32_armv5,%function
.global udivmod32_armv5
.type udivmod32_armv5,%function
.global .L_udiv_est_table

/* unsigned divide: r0 = r0 / r1 (r2, r3, ip clobbered). */
udiv32_armv5:
ARM5_UDIV32_BODY r0, r1, r0, "", r2, r3, ip, div0_wrap, 1

/* unsigned modulus: r0 = r0 % r1 (r2, r3, ip clobbered). */
umod32_armv5:
ARM5_UDIV32_BODY r0, r1, "", r0, r2, r3, ip, div0_wrap, 1

/* unsigned divide + modulus: r0 = quotient, r1 = remainder. */
udivmod32_armv5:
ARM5_UDIV32_BODY r0, r1, r0, r1, r2, r3, ip, div0_wrap, 1

/* 64-entry table of 8-bit reciprocal estimates, indexed by the top 7
   bits of the normalized divisor (divisor << clz(divisor)) minus 64;
   consumed by the ldrb in ARM5_UDIV32_BODY. */
.L_udiv_est_table:
.byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
.byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
.byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
.byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
.byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
.byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
.byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
.byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
#endif

/* Common divide-by-zero handler: push the caller's lr, zero r0 and jump
   to __div0 (defined elsewhere).  NOTE(review): __div0 is assumed to
   consume the stacked lr if it returns — confirm against its
   definition. */
div0_wrap:
str lr, [sp, #-4]!
mov r0, #0
ldr pc, =__div0