1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* perform 32x32->40 unsigned multiply, round off and return top 8 bits
 *
 * Computes (m * n + (1 << 31)) >> 32 using four 16x16->32 "mulu" partial
 * products, because only a 16-bit hardware multiply is used here.
 *
 * All intermediate sums are kept in 32-bit registers and wrap modulo 2^32,
 * so the value returned is really bits 32..47 of (m * n + (1 << 31)). The
 * result is therefore exact only while that sum is below 2^48; the intended
 * use (per the summary above) is products that fit in 40 bits, leaving an
 * 8-bit result.
 *
 * m, n: unsigned 32-bit factors.
 * returns: the rounded high part of the product as described above.
 */
static inline uint32_t sc_mul_u32_rnd(uint32_t m, uint32_t n)
{
    unsigned r, t1, t2, t3;
    unsigned h = 1 << 15; /* rounding constant: half of the final >> 16 */
    /* notation:
       m = ab, n = cd   (a, c = upper 16 bits; b, d = lower 16 bits)
       final result is (((a * c) << 32) + ((b * c + a * d) << 16) + b * d +
            (1 << 31)) >> 32

       The sum is accumulated pre-shifted right by 16 so it fits in one
       register: b * d is the only term contributing to the low 16 bits of
       the product, so dropping its low half loses no carries.
    */
    asm (
        "swap.w  %[m], %[t1]\n\t" /* t1 = ba */
        "mulu    %[m], %[n]\n\t" /* b * d */
        "swap.w  %[n], %[t3]\n\t" /* t3 = dc */
        "sts     macl, %[r]\n\t" /* r = b * d */
        "mulu    %[m], %[t3]\n\t" /* b * c */
        "shlr16  %[r]\n\t" /* r = (b * d) >> 16 */
        "sts     macl, %[t2]\n\t" /* t2 = b * c */
        "mulu    %[t1], %[t3]\n\t" /* a * c */
        "add     %[t2], %[r]\n\t" /* r += b * c */
        "sts     macl, %[t3]\n\t" /* t3 = a * c */
        "mulu    %[t1], %[n]\n\t" /* a * d */
        "shll16  %[t3]\n\t" /* t3 = (a * c) << 16 */
        "sts     macl, %[t2]\n\t" /* t2 = a * d */
        "add     %[t2], %[r]\n\t" /* r += a * d */
        "add     %[t3], %[r]\n\t" /* r = ((b * d) >> 16) + (b * c + a * d) +
                                         ((a * c) << 16) */
        "add     %[h], %[r]\n\t" /* round result */
        "shlr16  %[r]\n\t" /* truncate result */
        : /* outputs (early-clobber: written before all inputs are consumed) */
        [r] "=&r"(r),
        [t1]"=&r"(t1),
        [t2]"=&r"(t2),
        [t3]"=&r"(t3)
        : /* inputs */
        [h] "r"  (h),
        [m] "r"  (m),
        [n] "r"  (n)
        : /* clobbers */
        "macl" /* every mulu overwrites MACL */
    );
    return r;
}