/* 32-bit unsigned and signed integer division and modulo routines for ARM,
   written as GNU assembler macros and run through the C preprocessor. */
#include <config.h>

/* ARM_UMOD32_BODY: unsigned 32-bit modulo using an unrolled
   compare-and-subtract sequence.

     numerator  register holding the numerator on entry and the remainder on
                exit
     divisor    register holding the divisor; only read by this macro
     tmp, bits  scratch registers
     div0label  label branched to when the divisor is zero; pass "" to omit
                the zero test entirely
     return     non-zero: every exit is bx lr; zero: exits reach local
                label 1 at the end of the macro */
.macro ARM_UMOD32_BODY numerator, divisor, tmp, bits, div0label, return
    /* The argument is compared inside double quotes, as in the other macros
       of this file. The assembler treats double quotes in .ifc/.ifnc as
       ordinary characters, so without them an empty argument would never
       compare equal to "" and a beq with no target would be emitted. */
.ifnc "", "\div0label"
    tst     \divisor, \divisor
    beq     \div0label
.endif
#if ARM_ARCH >= 5
    /* bits = clz(divisor) - clz(numerator), i.e. how far the divisor may be
       shifted left before its top bit passes the top bit of the numerator. */
    clz     \tmp, \numerator
    clz     \bits, \divisor
    subs    \bits, \bits, \tmp
    /* If the highest set bit of divisor is higher than that of numerator,
       skip straight to end, as numerator == result. */
.ifne \return
    bxcc    lr
.else
    bcc     1f
.endif
    /* bits = 31 - bits: number of leading unrolled steps that can be skipped.
       Z is set when there is nothing to skip. */
    rsbs    \bits, \bits, #31
#else
    /* No clz before ARMv5: find the alignment with a binary search. Each pass
       shifts the divisor copy left while it still does not exceed the
       correspondingly shifted numerator, and lowers the skip count. */
    mov     \bits, #31
    mov     \tmp, \divisor
.set shift, 16
.rept 4
    cmp     \tmp, \numerator, lsr #shift
    movls   \tmp, \tmp, lsl #shift
    subls   \bits, \bits, #shift
.set shift, shift >> 1
.endr
    /* Don't need to update the shifted divisor on the last pass. Also use subs
       instead of sub. If cmp sets Z, the subs will be executed and set Z if
       there are no passes to skip. Either way the PC add can be skipped safely
       if Z is set. */
    cmp     \tmp, \numerator, lsr #1
    sublss  \bits, \bits, #1
#endif
    /* Computed branch into the unrolled sequence. Reading pc yields the
       address of this instruction plus 8, which is the first step after the
       nop; each step is two instructions (8 bytes), so adding bits * 8 skips
       that many steps. */
    addne   pc, pc, \bits, lsl #3
    nop
    /* 31 steps for shifts 31 down to 1: subtract the shifted divisor whenever
       it does not exceed what is left of the numerator. */
.set shift, 31
.rept 31
    cmp      \numerator, \divisor, lsl #shift
    subcs    \numerator, \numerator, \divisor, lsl #shift
.set shift, shift - 1
.endr
    /* Final step with the unshifted divisor; numerator now holds the
       remainder. */
    cmp     \numerator, \divisor
    subcs   \numerator, \numerator, \divisor
.ifne \return
    bx lr
.else
1:
.endif
.endm

/* ARM_UDIV32_BODY: front end for ARM_UDIV32_BODY_INNER. All arguments are
   forwarded unchanged; the only job of this macro is to pick which registers
   the inner body uses as its scratch register and as the holder of the
   negated divisor, depending on the target architecture. */
.macro ARM_UDIV32_BODY numerator, divisor, quotient, remainder, tmp, bits, div0label, invertedlabel, inverteddivisor, return
#if ARM_ARCH < 5
    /* Before ARMv5 the divisor register itself carries the negated divisor
       and the tmp register is the scratch register. */
    ARM_UDIV32_BODY_INNER \numerator, \divisor, \quotient, \remainder, \tmp, \bits, \div0label, \invertedlabel, \inverteddivisor, \return, \divisor
#else
    /* From ARMv5 on the roles are swapped: the tmp register carries the
       negated divisor and the divisor register serves as scratch. */
    ARM_UDIV32_BODY_INNER \numerator, \divisor, \quotient, \remainder, \divisor, \bits, \div0label, \invertedlabel, \inverteddivisor, \return, \tmp
#endif
.endm

/* ARM_UDIV32_BODY_INNER: body of the unsigned 32-bit divide. It is normally
   instantiated through ARM_UDIV32_BODY, which selects the tmp / idivisor
   register assignment for the target architecture.

     numerator        register holding the numerator; overwritten
     divisor          register holding the divisor, or the already negated
                      divisor when inverteddivisor is non-zero
     quotient         register receiving the quotient, or "" if not wanted
     remainder        register receiving the remainder, or "" if not wanted
     tmp, bits        scratch registers
     div0label        label branched to on a zero divisor, or "" for no test
     invertedlabel    optional label emitted right after the negate / zero
                      test prologue, or ""
     inverteddivisor  zero: the divisor is negated here into idivisor;
                      non-zero: the caller has already negated it. Nothing
                      here copies divisor to idivisor in that case, so
                      idivisor must name the register that already holds the
                      negated value.
     return           non-zero: every exit is bx lr; zero: exits branch to, or
                      fall through to, local label 99 at the end of the macro
     idivisor         register that holds the negated divisor */
.macro ARM_UDIV32_BODY_INNER numerator, divisor, quotient, remainder, tmp, bits, div0label, invertedlabel, inverteddivisor, return, idivisor
    /* When this is wrapped for signed division (inverteddivisor non-zero),
       the wrapper code has already negated the divisor, so only the zero
       divisor test, if requested, is emitted here. Otherwise the divisor is
       negated into idivisor; rsbs sets Z for a zero divisor, so the negation
       doubles as the zero test. */
.ifeq \inverteddivisor
.ifnc "", "\div0label"
    rsbs    \idivisor, \divisor, #0
    beq     \div0label
.else
    rsb     \idivisor, \divisor, #0
.endif
.else
.ifnc "",   "\div0label"
    tst     \divisor, \divisor
    beq     \div0label
.endif
.endif
    /* If the function using this macro expects to be wrapped for signed
       division, a second entry point can be offered that skips the divisor
       inversion and the zero divisor test. */
.ifnc "", "\invertedlabel"
\invertedlabel:
.endif
    /* This SWAR divider requires a numerator less than 1<<31, because it must
       be able to shift the remainder left at each step without shifting out
       topmost bit. Since a shift might be needed for the aligned remainder to
       exceed the divisor, the topmost bit must be unset at the start to avoid
       this overflow case. The original numerator is saved so that the result
       can be corrected after the reduced division completes. */
#if ARM_ARCH >= 5
    /* bits = clz(divisor) - clz(numerator). The divisor register is
       overwritten with its leading zero count; the negated divisor is in
       idivisor. */
    clz     \bits, \numerator
    clz     \divisor, \divisor
    subs    \bits, \divisor, \bits
    /* Divisor's top bit is at or above the numerator's: the quotient can only
       be 0 or 1, handled at label 20. */
    bls     20f
    /* Save the original numerator in tmp. If its top bit is set, halve it and
       use one less alignment shift. */
    movs    \tmp, \numerator
    submi   \bits, \bits, #1
    movmi   \numerator, \numerator, lsr #1
    /* bits asr #32 is all ones when bits is negative, so this clears bits to
       zero in that case and leaves it alone otherwise. */
    bic     \bits, \bits, \bits, asr #32
    /* Align the negated divisor, then turn bits into the number of unrolled
       steps to skip. */
    mov     \idivisor, \idivisor, lsl \bits
    rsb     \bits, \bits, #30
#else
    /* numerator + (-divisor): no carry or a zero result means
       numerator <= divisor, so the quotient can only be 0 or 1 (label 20
       reuses these flags). */
    cmn     \numerator, \idivisor
    bls     20f
    /* Save the original numerator in tmp and halve the working copy if its
       top bit is set. */
    movs    \tmp, \numerator
    movmi   \numerator, \numerator, lsr #1
    /* Binary search for the alignment with shifts 16, 8, 4, 2, 1: whenever
       the numerator shifted right still reaches the divisor, shift the
       negated divisor left by that amount and lower the skip count. */
    mov     \bits, #30
.set shift, 16
.rept 5
    cmn     \idivisor, \numerator, lsr #shift
    subcs   \bits, \bits, #shift
    movcs   \idivisor, \idivisor, lsl #shift
.set shift, shift >> 1
.endr
#endif
    /* First trial subtraction (done by adding the negated divisor); it is
       undone when there is no carry. The carry is the first quotient bit and
       is consumed by the next adcs/adc. */
    adds    \numerator, \numerator, \idivisor
    subcc   \numerator, \numerator, \idivisor
    /* Computed branch: pc reads as this instruction plus 8, which is the
       first unrolled step after the nop. Each step is two instructions, so
       bits * 8 skips that many of the 30 steps. add does not alter the
       flags. */
    add     pc, pc, \bits, lsl #3
    nop
    /* Each step shifts the working value left by one, brings the previous
       quotient bit in through the carry, and adds the negated divisor; the
       addition is undone when it does not carry. */
.rept 30
    adcs    \numerator, \idivisor, \numerator, lsl #1
    subcc   \numerator, \numerator, \idivisor
.endr
    /* Shift in the last quotient bit. numerator now holds the remainder in
       its upper bits and the quotient in its lower bits. */
    adc     \numerator, \numerator, \numerator
    /* N = top bit of the original numerator, C = its bottom bit. bits becomes
       the bit position separating remainder from quotient. */
    movs    \tmp, \tmp, asr #1
    rsb     \bits, \bits, #31
    /* The numerator was halved before dividing: fix up at label 10. */
    bmi     10f
    /* Split the combined value: remainder = upper part, quotient = what is
       left after removing the upper part. When no remainder register was
       given, idivisor is used as scratch for the upper part. */
.ifc "", "\quotient"
    mov     \remainder, \numerator, lsr \bits
.else
.ifc "", "\remainder"
    mov     \idivisor, \numerator, lsr \bits
    eor     \quotient, \numerator, \idivisor, lsl \bits
.else
    mov     \remainder, \numerator, lsr \bits
    eor     \quotient, \numerator, \remainder, lsl \bits
.endif
.endif
.ifne \return        
    bx      lr
.else
    b       99f
.endif
10:
    /* Correction for a numerator that was halved. tmp = remainder of the
       halved division, numerator = its quotient. */
    mov     \tmp, \numerator, lsr \bits
    eor     \numerator, \numerator, \tmp, lsl \bits
    sub     \bits, \bits, #1
    /* tmp = 2 * tmp + C, where C is still the bottom bit of the original
       numerator from the movs above (nothing in between writes the flags). */
    adc     \tmp, \tmp, \tmp
    /* One more trial subtraction with the divisor shifted back down; its
       carry is the lowest quotient bit. */
    adds    \tmp, \tmp, \idivisor, asr \bits
.ifnc "", "\quotient"
    adc     \quotient, \numerator, \numerator
.endif
.ifnc "", "\remainder"
    /* Undo the trial subtraction if it did not carry. */
    subcc   \remainder, \tmp, \idivisor, asr \bits
    movcs   \remainder, \tmp
.endif
.ifne \return        
    bx      lr
.else
    b       99f
.endif
20:
    /* The branch is reached on ARMv5+ based on the result of comparing leading
       zero counts, so we still need to compare the actual values. */
#if ARM_ARCH >= 5
.ifnc "", "\remainder"
    /* remainder = numerator - divisor, restored to numerator if that did not
       carry; C ends up set when numerator >= divisor. */
    adds    \remainder, \numerator, \idivisor
    subcc   \remainder, \remainder, \idivisor
.else
    cmn     \numerator, \idivisor
.endif
#else
    /* Flags are still those of the cmn before the bls: C set here means
       numerator == divisor, C clear means numerator < divisor. */
.ifnc "", "\remainder"
    movcs   \remainder, #0
.ifnc "\remainder", "\numerator"
    movcc   \remainder, \numerator
.endif
.endif
#endif
.ifnc "", "\quotient"
    /* Quotient is 1 if the subtraction carried, else 0. */
    movcs   \quotient, #1
    movcc   \quotient, #0
.endif
.ifne \return        
    bx      lr
.else
99:
.endif
.endm

/* ARM_SDIV32_BODY: signed 32-bit divide built on ARM_UDIV32_BODY. The signs
   are recorded in the sign register, the unsigned division is run on the
   magnitudes, and the results are negated afterwards as needed.

     numerator, divisor   input registers; both may be modified
     quotient, remainder  result registers, either may be "" if not wanted
     tmp, bits            scratch registers handed to ARM_UDIV32_BODY
     sign                 register keeping the sign information across the
                          unsigned division
     div0label            label branched to on a zero divisor, or ""
     return               non-zero: finish with bx lr; zero: fall through */
.macro ARM_SDIV32_BODY numerator, divisor, quotient, remainder, tmp, sign, bits, div0label, return
    /* sign[31] = divisor sign; Z is set when the divisor is non-negative */
    ands    \sign, \divisor, #1<<31
    /* We can invert the divisor here for ARMv4, and save an instruction if
       there is no div0 handling. On ARMv5+ a negative divisor is made
       positive; on ARMv4 a non-negative divisor is negated instead, so the
       register always ends up holding the negated magnitude and the unsigned
       body is invoked below with inverteddivisor = 1. */
#if ARM_ARCH >= 5
    rsbne   \divisor, \divisor, #0
#else
    rsbeq   \divisor, \divisor, #0
#endif
    /* sign[31] = result sign, sign[0:30], C = numerator sign. asr #32 fills
       the operand with the numerator's sign bit and also delivers that bit as
       the shifter carry-out. */
    eors    \sign, \sign, \numerator, asr #32
    /* Negate a negative numerator. */
    rsbcs   \numerator, \numerator, #0
    /* Unsigned division, instantiated with return = 0 so that control comes
       back here for the sign fix-up. */
#if ARM_ARCH >= 5
    ARM_UDIV32_BODY \numerator, \divisor, \quotient, \remainder, \tmp, \bits, \div0label, "", 0, 0
#else
    ARM_UDIV32_BODY \numerator, \divisor, \quotient, \remainder, \tmp, \bits, \div0label, "", 1, 0
#endif
    /* Shift the recorded signs out: C = sign[31] (result sign),
       N = sign[30] (numerator sign). */
    movs    \sign, \sign, lsl #1
.ifnc "", "\quotient"
    /* Quotient is negated when the operand signs differed. */
    rsbcs   \quotient, \quotient, #0
.endif
.ifnc "", "\remainder"
    /* Remainder is negated when the numerator was negative. */
    rsbmi   \remainder, \remainder, #0
.endif
.ifne \return
    bx lr
.endif
.endm

#if ARM_ARCH >= 5
/* ARM5_UDIV32_BODY: unsigned 32-bit divide for ARMv5 and later that works
   from a table-driven estimate of the reciprocal of the divisor, followed by
   a multiply and a small correction. Power-of-two divisors are handled with
   a shift and a mask.

     numerator            register holding the numerator; may be overwritten
     divisor              register holding the divisor; overwritten
     quotient, remainder  result registers; either may be "" if not wanted
     bits, inv, neg       scratch registers (neg holds the negated divisor)
     div0label            label branched to on a zero divisor, or ""
     return               non-zero: every exit is bx lr; zero: exits reach
                          local label 99 at the end of the macro

   The code reads the byte table at .L_udiv_est_table relative to pc. */
.macro ARM5_UDIV32_BODY numerator, divisor, quotient, remainder, bits, inv, neg, div0label, return
    /* Normalise the divisor so that its top set bit lands in bit 31 (a zero
       divisor gives zero). */
    clz     \bits, \divisor
    mov     \inv, \divisor, lsl \bits
    /* Equal to 1<<31: the divisor is a power of two. Lower: only possible
       for a zero divisor. Both cases go to label 20, which tells them apart
       using these flags. */
    cmp     \inv, #1<<31
    /* inv = pc + top seven bits of the normalised divisor. add leaves the
       flags of the cmp intact for the bls. */
    add     \inv, pc, \inv, lsr #25
    bls     20f
    /* Load the 8-bit reciprocal estimate. The offset cancels the pc value
       captured by the add above (which equals the address of this ldrb) and
       the index bias of 64, since the top seven bits of a normalised divisor
       are in the range 64..127. */
    ldrb    \inv, [\inv, #.L_udiv_est_table-.-64]
    subs    \bits, \bits, #7
    rsb     \neg, \divisor, #0
    /* Fewer than 7 leading zeros in the divisor: use the path at label 10. */
    bmi     10f
    /* Scale the estimate, then refine it twice. Each refinement multiplies
       the estimate by the negated divisor and adds the high part of
       estimate * product back onto the estimate (this has the shape of a
       Newton-Raphson reciprocal iteration -- TODO confirm against the
       derivation of the algorithm). */
    mov     \divisor, \inv, lsl \bits
    mul     \inv, \divisor, \neg
    smlawt  \divisor, \divisor, \inv, \divisor
    mul     \inv, \divisor, \neg
#if ARM_ARCH >= 6
    smmla   \divisor, \divisor, \inv, \divisor
#else
    /* Without smmla, accumulate the 64-bit product onto bits:divisor with
       the low word cleared. */
    mov     \bits, #0
    smlal   \bits, \divisor, \divisor, \inv
#endif
    /* divisor = high word of numerator * reciprocal = quotient estimate;
       the low word in bits is not used. */
    umull   \bits, \divisor, \numerator, \divisor
    /* Bias the numerator by one divisor so that the trial remainder below is
       numerator - (estimate + 1) * divisor. */
    add     \numerator, \numerator, \neg
.ifc "", "\quotient"
    /* Remainder only: trial remainder, then add one divisor back on carry
       and subtract two divisors when the comparison result is
       non-negative. */
    mla     \remainder, \divisor, \neg, \numerator
    cmn     \remainder, \neg
    subcs   \remainder, \neg
    addpl   \remainder, \remainder, \neg, lsl #1
.else
    /* Trial remainder in inv; the flags of the cmn select the correction:
       the quotient estimate gets +1 on carry clear and +2 on a non-negative
       result, and the remainder is adjusted to match. */
    mla     \inv, \divisor, \neg, \numerator
    cmn     \inv, \neg
    mov     \quotient, \divisor
    addcc   \quotient, \quotient, #1
    addpl   \quotient, \quotient, #2
.ifnc "", "\remainder"
    mov     \remainder, \inv
    subcs   \remainder, \neg
    addpl   \remainder, \remainder, \neg, lsl #1
.endif
.endif
.if \return
    bx      lr
.else
    b       99f
.endif
10:
    /* Path for divisors with fewer than 7 leading zeros: bits becomes the
       (positive) right shift applied to the slightly reduced table
       estimate. */
    rsb     \bits, \bits, #0
    sub     \inv, \inv, #4
    mov     \divisor, \inv, lsr \bits
    /* divisor = high word of estimate * numerator = quotient estimate. */
    umull   \bits, \divisor, \divisor, \numerator
.ifc "", "\quotient"
    /* Remainder only: remainder = numerator - estimate * divisor, then
       subtract two divisors if half the remainder still reaches the divisor,
       and one more divisor if the remainder still reaches it. */
    mla     \remainder, \divisor, \neg, \numerator
    cmn     \neg, \remainder, lsr #1
    addcs   \remainder, \neg, lsl #1
    cmn     \neg, \remainder
    addcs   \remainder, \neg
.else
    /* Same correction, tracking the quotient: +2 with the first adjustment,
       +1 with the second. */
    mla     \inv, \divisor, \neg, \numerator
    mov     \quotient, \divisor
    cmn     \neg, \inv, lsr #1
    addcs   \quotient, \quotient, #2
    addcs   \inv, \inv, \neg, lsl #1
    cmn     \inv, \neg
    addcs   \quotient, \quotient, #1
.ifnc "", "\remainder"
    addcs   \remainder, \inv, \neg
    movcc   \remainder, \inv
.endif
.endif
.if \return
    bx      lr
.else
    b       99f
.endif
20:
    /* Flags are still those of the cmp against 1<<31: not equal here means
       the normalised divisor was below 1<<31, i.e. the divisor is zero. */
.ifnc "", "\div0label"
    bne     \div0label
.endif
    /* Power-of-two divisor: remainder = numerator & (divisor - 1),
       quotient = numerator >> (31 - clz(divisor)). */
.ifnc "", "\remainder"
    sub     \divisor, \divisor, #1
.ifnc "", "\quotient"
    rsb     \bits, \bits, #31
.endif
    and     \remainder, \numerator, \divisor
.ifnc "", "\quotient"
    mov     \quotient, \numerator, lsr \bits
.endif
.else
    rsb     \bits, \bits, #31
    mov     \quotient, \numerator, lsr \bits
.endif
.if \return
    bx      lr
.endif
99:
.endm
#endif    

    /* Exported entry points built from the generic macros. In every one of
       them r0 is passed as the numerator and r1 as the divisor, a zero
       divisor branches to div0_wrap, and the macros are instantiated with
       return = 1, so each body ends in bx lr on all of its paths and never
       falls through into the next entry point. */
    .text
    .align
    .global udiv32_arm
    .type   udiv32_arm,%function
    .global umod32_arm
    .type   umod32_arm,%function
    .global umod32_arm_2
    .type   umod32_arm_2,%function
    .global udivmod32_arm
    .type   udivmod32_arm,%function
    .global sdiv32_arm
    .type   sdiv32_arm,%function
    .global smod32_arm
    .type   smod32_arm,%function
    .global sdivmod32_arm
    .type   sdivmod32_arm,%function

/* Unsigned divide: quotient in r0; r2 and r3 are scratch. */
udiv32_arm:
    ARM_UDIV32_BODY r0, r1, r0, "", r2, r3, "div0_wrap", "", 0, 1

/* Unsigned modulo via the divide body: remainder in r0; r2 and r3 are
   scratch. */
umod32_arm:
    ARM_UDIV32_BODY r0, r1, "", r0, r2, r3, "div0_wrap", "", 0, 1

/* Unsigned modulo via the dedicated compare-and-subtract body: remainder in
   r0; r2 and r3 are scratch. */
umod32_arm_2:
    ARM_UMOD32_BODY r0, r1, r2, r3, "div0_wrap", 1

/* Unsigned divide and modulo: quotient in r0, remainder in r1; r2 and r3 are
   scratch. */
udivmod32_arm:
    ARM_UDIV32_BODY r0, r1, r0, r1, r2, r3, "div0_wrap", "", 0, 1

/* Signed divide: quotient in r0; r2 and ip are scratch, r3 keeps the signs. */
sdiv32_arm:
    ARM_SDIV32_BODY r0, r1, r0, "", r2, r3, ip, "div0_wrap", 1

/* Signed modulo: remainder in r0; r2 and ip are scratch, r3 keeps the
   signs. */
smod32_arm:
    ARM_SDIV32_BODY r0, r1, "", r0, r2, r3, ip, "div0_wrap", 1

/* Signed divide and modulo: quotient in r0, remainder in r1; r2 and ip are
   scratch, r3 keeps the signs. */
sdivmod32_arm:
    ARM_SDIV32_BODY r0, r1, r0, r1, r2, r3, ip, "div0_wrap", 1

#if ARM_ARCH >= 5
    /* Entry points built from ARM5_UDIV32_BODY, only assembled for ARMv5 and
       later. r0 is the numerator and r1 the divisor; r2, r3 and ip are passed
       as the bits, inv and neg scratch registers. A zero divisor branches to
       div0_wrap, and return = 1 makes every path end in bx lr. */
    .global udiv32_armv5
    .type   udiv32_armv5,%function
    .global umod32_armv5
    .type   umod32_armv5,%function
    .global udivmod32_armv5
    .type   udivmod32_armv5,%function
    .global .L_udiv_est_table

/* Unsigned divide: quotient in r0. */
udiv32_armv5:
    ARM5_UDIV32_BODY r0, r1, r0, "", r2, r3, ip, div0_wrap, 1

/* Unsigned modulo: remainder in r0. */
umod32_armv5:
    ARM5_UDIV32_BODY r0, r1, "", r0, r2, r3, ip, div0_wrap, 1

/* Unsigned divide and modulo: quotient in r0, remainder in r1. */
udivmod32_armv5:
    ARM5_UDIV32_BODY r0, r1, r0, r1, r2, r3, ip, div0_wrap, 1

/* 64 one-byte initial estimates read by ARM5_UDIV32_BODY, indexed by the top
   seven bits of the normalised divisor minus 64. The table is addressed
   pc-relative from the three bodies above. */
.L_udiv_est_table:
    .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
    .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
    .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
    .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
    .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
    .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
    .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
    .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
#endif

/* Common zero-divisor exit used as the div0label of every entry point in
   this file. It pushes lr, sets r0 to 0 and jumps to __div0 by loading its
   address from the literal pool into pc. lr is not updated, so this is a
   jump rather than a call.
   NOTE(review): presumably __div0 expects the caller's return address on top
   of the stack -- confirm against its definition. */
div0_wrap:
    str     lr, [sp, #-4]!
    mov     r0, #0
    ldr     pc, =__div0