Carefully written implementations of the 32-bit integer divide and modulus functions for ARM. These are still using a naive digit-by-digit algorithm, but the core loop has been carefully written.

llvm-svn: 127882
2011-03-18 16:35:02 +00:00 · 2011-03-18 16:35:02 +00:00 · 5abb5c14c4
parent 96a4bddefb
commit 5abb5c14c4
6 changed files with 337 additions and 32 deletions
--- a/compiler-rt/lib/arm/divmodsi4.S
+++ b/compiler-rt/lib/arm/divmodsi4.S
@@ -0,0 +1,47 @@
+/*===-- divmodsi4.S - 32-bit signed integer divide and modulus ------------===//
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __divmodsi4 (32-bit signed integer divide and
+ * modulus) function for the ARM architecture.  A naive digit-by-digit
+ * computation is employed for simplicity.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include "../assembly.h"
+
+#define ESTABLISH_FRAME    \
+    push   {r4-r7, lr}   ;\
+    add     r7,     sp, #12  // five words pushed; sp + 12 is the slot holding the saved r7
+#define CLEAR_FRAME_AND_RETURN \
+    pop    {r4-r7, pc}  // restore r4-r7; loading the saved lr value into pc returns
+
+.syntax unified
+.align 3
+DEFINE_COMPILERRT_FUNCTION(__divmodsi4)  // in: r0 = a, r1 = b, r2 = address for the modulus
+    ESTABLISH_FRAME  // saves r4-r7 and lr, so r4-r6 and lr can be used below
+//  Keep what is needed after the call: r4 = a ^ b (bit 31 is the sign of the
+//  quotient), r5 = a (its sign goes to the modulus), r6 = modulus address.
+    eor     r4,     r0, r1
+    mov     r5,     r0
+    mov     r6,     r2
+//  Take the absolute value of a and b via abs(x) = (x^(x >> 31)) - (x >> 31).
+    eor     ip,     r0, r0, asr #31  // ip = a ^ (a >> 31); the shift gives 0 or -1
+    eor     lr,     r1, r1, asr #31  // lr = b ^ (b >> 31); lr is only scratch here
+    sub     r0,     ip, r0, asr #31  // r0 = abs(a)
+    sub     r1,     lr, r1, asr #31  // r1 = abs(b)
+//  Unsigned divmod of abs(a) by abs(b); r2 still holds the modulus address.
+    bl      ___udivmodsi4
+//  Negate r0 if bit 31 of r4 is set, and the stored modulus if bit 31 of r5 is.
+    ldr     r1,    [r6]  // fetch the value now stored at the modulus address
+    eor     r0,     r0, r4, asr #31  // r0 ^= mask, where mask = r4 >> 31 (0 or -1)
+    sub     r0,     r0, r4, asr #31  // r0 -= mask; together: r0 = -r0 when mask is -1
+    eor     r1,     r1, r5, asr #31  // same conditional negate on the modulus,
+    sub     r1,     r1, r5, asr #31  // keyed on the sign of the original a
+    str     r1,    [r6]  // write the signed modulus back through the saved address
+    CLEAR_FRAME_AND_RETURN  // r0 = signed quotient
--- a/compiler-rt/lib/arm/divsi3.S
+++ b/compiler-rt/lib/arm/divsi3.S
@@ -0,0 +1,39 @@
+/*===-- divsi3.S - 32-bit signed integer divide ---------------------------===//
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __divsi3 (32-bit signed integer divide) function
+ * for the ARM architecture as a wrapper around the unsigned routine.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include "../assembly.h"
+
+#define ESTABLISH_FRAME \
+    push   {r4, r7, lr}    ;\
+    add     r7,     sp, #4  // three words pushed; sp + 4 is the slot holding the saved r7
+#define CLEAR_FRAME_AND_RETURN \
+    pop    {r4, r7, pc}  // restore r4 and r7; loading the saved lr value into pc returns
+
+.syntax unified
+.align 3
+DEFINE_COMPILERRT_FUNCTION(__divsi3)  // in: r0 = a, r1 = b; out: r0 = signed quotient
+    ESTABLISH_FRAME  // saves r4, r7 and lr
+//  Set aside the sign of the quotient: bit 31 of a ^ b, held in r4 over the call.
+    eor     r4,     r0, r1
+//  Take absolute value of a and b via abs(x) = (x^(x >> 31)) - (x >> 31).
+    eor     r2,     r0, r0, asr #31  // r2 = a ^ (a >> 31); the shift gives 0 or -1
+    eor     r3,     r1, r1, asr #31  // r3 = b ^ (b >> 31)
+    sub     r0,     r2, r0, asr #31  // r0 = abs(a)
+    sub     r1,     r3, r1, asr #31  // r1 = abs(b)
+//  r0 = abs(a) / abs(b), computed by the unsigned routine.
+    bl      ___udivsi3
+//  Negate r0 when bit 31 of r4 is set (same xor-then-subtract identity); return.
+    eor     r0,     r0, r4, asr #31  // r0 ^= mask, where mask = r4 >> 31 (0 or -1)
+    sub     r0,     r0, r4, asr #31  // r0 -= mask
+    CLEAR_FRAME_AND_RETURN
--- a/compiler-rt/lib/arm/modsi3.S
+++ b/compiler-rt/lib/arm/modsi3.S
@@ -1,36 +1,39 @@
-//===-------- modsi3.S - Implement modsi3 ---------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
+/*===-- modsi3.S - 32-bit signed integer modulus --------------------------===//
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __modsi3 (32-bit signed integer modulus) function
+ * for the ARM architecture as a wrapper around the unsigned routine.
+ *
+ *===----------------------------------------------------------------------===*/

 #include "../assembly.h"

-//
-// extern int32_t __modsi3(int32_t a, int32_t b);
-//
-// Returns the remainder when dividing two 32-bit signed integers.
-// Conceptually, the function is: { return a - (a / b) * b; }
-// But if you write that in C, llvm compiles it to a call to __modsi3...
-//
-	.align 2
+#define ESTABLISH_FRAME \
+    push   {r4, r7, lr}    ;\
+    add     r7,     sp, #4  // three words pushed; sp + 4 is the slot holding the saved r7
+#define CLEAR_FRAME_AND_RETURN \
+    pop    {r4, r7, pc}  // restore r4 and r7; loading the saved lr value into pc returns
+
+.syntax unified
+.align 3
 DEFINE_COMPILERRT_FUNCTION(__modsi3)
-	push	{r4, r5, r7, lr}
-	add	r7, sp, #8	// set stack frame
-	mov	r5, r0		// save a
-	mov	r4, r1		// save b
-	bl      ___divsi3	// compute a/b
-#if __ARM_ARCH_7A__
-	mls     r0, r4, r0, r5  // mulitple result * b and subtract from a
-#else	
-	// before armv7, does not have "mls" instruction
-	mul	r3, r0, r4	// multiple result * b
-	sub     r0, r5, r3      // a - result
-#endif
-	pop     {r4, r5, r7, pc}
-	
-
-
+    ESTABLISH_FRAME  // in: r0 = a, r1 = b; saves r4, r7 and lr
+    //  Keep the dividend in r4 over the call; only its sign bit is used later.
+    mov     r4,     r0
+    //  Take absolute value of a and b via abs(x) = (x^(x >> 31)) - (x >> 31).
+    eor     r2,     r0, r0, asr #31  // r2 = a ^ (a >> 31); the shift gives 0 or -1
+    eor     r3,     r1, r1, asr #31  // r3 = b ^ (b >> 31)
+    sub     r0,     r2, r0, asr #31  // r0 = abs(a)
+    sub     r1,     r3, r1, asr #31  // r1 = abs(b)
+    //  r0 = abs(a) % abs(b), computed by the unsigned routine.
+    bl      ___umodsi3
+    //  Negate r0 when the dividend was negative (bit 31 of r4 set), then return.
+    eor     r0,     r0, r4, asr #31  // r0 ^= mask, where mask = r4 >> 31 (0 or -1)
+    sub     r0,     r0, r4, asr #31  // r0 -= mask
+    CLEAR_FRAME_AND_RETURN
--- a/compiler-rt/lib/arm/udivmodsi4.S
+++ b/compiler-rt/lib/arm/udivmodsi4.S
@@ -0,0 +1,80 @@
+/*===-- udivmodsi4.S - 32-bit unsigned integer divide and modulus ---------===//
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __udivmodsi4 (32-bit unsigned integer divide and
+ * modulus) function for the ARM architecture.  A naive digit-by-digit
+ * computation is employed for simplicity.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include "../assembly.h"
+
+#define ESTABLISH_FRAME    \
+    push   {r4, r7, lr}   ;\
+    add     r7,     sp, #4  // three words pushed; sp + 4 is the slot holding the saved r7
+#define CLEAR_FRAME_AND_RETURN \
+    pop    {r4, r7, pc}  // restore r4 and r7; loading the saved lr value into pc returns
+    
+#define a r0  // dividend on entry; reduced in place until it is the remainder
+#define b r1  // divisor (as a macro this would also rewrite a bare b mnemonic)
+#define i r3  // shift count: bit position currently being tried
+#define r r4  // trial difference a - (b << i)
+#define q ip  // quotient under construction
+#define one lr  // constant 1, shifted left by i to form a quotient bit
+
+.syntax unified
+.align 3
+DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
+//  In: r0 = a, r1 = b, r2 = address for the remainder.  Out: r0 = quotient,
+//  remainder stored to [r2].  When b == 0 the quotient is 0 and a is stored.
+//  Simple digit by digit algorithm: before the divide loop we compute the
+//  left shift that aligns the MSB of the divisor with that of the dividend
+//  (a negative shift means the quotient is zero, and we early out), conjure
+//  the constant 1 used to build quotient bits, and zero the quotient.
+    ESTABLISH_FRAME  // saves r4 (used as r) and lr (used as one)
+    clz     r4,     a  // r4 = clz(a)
+    tst     b,      b   // detect divide-by-zero; clz/mov below keep these flags
+    clz     r3,     b  // r3 = clz(b)
+    mov     q,      #0
+    beq     L_return    // return 0 if b is zero (a is stored as remainder).
+    mov     one,    #1
+    subs    i,      r3, r4  // i = clz(b) - clz(a)
+    blt     L_return    // return 0 if MSB(a) < MSB(b); a is the remainder
+
+L_mainLoop:
+//  This loop basically implements the following:
+//
+//  do {
+//      if (a >= b << i) {
+//          a -= b << i;
+//          q |= 1 << i;
+//          if (a == 0) break;
+//      }
+//  } while (--i)
+//
+//  The shift-0 step is left to the code after the loop; that lets a single
+//  bhi act as both the a == 0 exit and the loop test.  If i starts at 0, bhi
+//  also falls through after one pass (subsne, when it runs, borrows).
+    subs    r,      a,  b, lsl i  // r = a - (b << i); HS iff a >= b << i, EQ iff r == 0
+    orrhs   q,      q,one, lsl i  // on HS: q |= 1 << i
+    movhs   a,      r  // on HS: a = r
+    subsne  i,      i, #1  // only when r != 0: --i, replacing the flags
+    bhi     L_mainLoop  // loop iff the decrement ran, did not borrow, and i != 0
+
+//  Trial subtraction for shift 0, which the main loop does not perform when it
+//  exits by counting i down; update quotient and remainder if it succeeds.
+    subs    r,      a,  b  // HS iff a >= b
+    orrhs   q,      #1
+    movhs   a,      r
+
+L_return:
+//  Store the remainder, and move the quotient to r0, then return.
+    str     a,     [r2]  // r2 is never written above, so it is still the caller's address
+    mov     r0,     q
+    CLEAR_FRAME_AND_RETURN
--- a/compiler-rt/lib/arm/udivsi3.S
+++ b/compiler-rt/lib/arm/udivsi3.S
@@ -0,0 +1,78 @@
+/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __udivsi3 (32-bit unsigned integer divide) 
+ * function for the ARM architecture.  A naive digit-by-digit computation is
+ * employed for simplicity.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include "../assembly.h"
+
+#define ESTABLISH_FRAME \
+    push   {r7, lr}    ;\
+    mov     r7,     sp  // two words pushed; sp itself is the slot holding the saved r7
+#define CLEAR_FRAME_AND_RETURN \
+    pop    {r7, pc}  // restore r7; loading the saved lr value into pc returns
+
+#define a r0  // dividend on entry; reduced in place by the loop
+#define b r1  // divisor (as a macro this would also rewrite a bare b mnemonic)
+#define r r2  // trial difference a - (b << i)
+#define i r3  // shift count: bit position currently being tried
+#define q ip  // quotient under construction
+#define one lr  // constant 1, shifted left by i to form a quotient bit
+
+.syntax unified
+.align 3
+DEFINE_COMPILERRT_FUNCTION(__udivsi3)
+//  In: r0 = a, r1 = b.  Out: r0 = quotient (0 when b == 0).
+//  Simple digit by digit algorithm: before the divide loop we compute the
+//  left shift that aligns the MSB of the divisor with that of the dividend
+//  (a negative shift means the quotient is zero, and we early out).  We
+//  also conjure the constant 1 used to build quotient bits, and zero the
+//  quotient.
+    ESTABLISH_FRAME  // saves r7 and lr; lr is then free to serve as one
+    clz     r2,     a  // r2 = clz(a); r2 is reused as r once i is computed
+    tst     b,      b   // detect divide-by-zero; clz/mov below keep these flags
+    clz     r3,     b  // r3 = clz(b)
+    mov     q,      #0
+    beq     L_return    // return 0 if b is zero.
+    mov     one,    #1
+    subs    i,      r3, r2  // i = clz(b) - clz(a)
+    blt     L_return    // return 0 if MSB(a) < MSB(b)
+
+L_mainLoop:
+//  This loop basically implements the following:
+//
+//  do {
+//      if (a >= b << i) {
+//          a -= b << i;
+//          q |= 1 << i;
+//          if (a == 0) break;
+//      }
+//  } while (--i)
+//
+//  The shift-0 step is left to the code after the loop; that lets a single
+//  bhi act as both the a == 0 exit and the loop test.  If i starts at 0, bhi
+//  also falls through after one pass (subsne, when it runs, borrows).
+    subs    r,      a,  b, lsl i  // r = a - (b << i); HS iff a >= b << i, EQ iff r == 0
+    orrhs   q,      q,one, lsl i  // on HS: q |= 1 << i
+    movhs   a,      r  // on HS: a = r
+    subsne  i,      i, #1  // only when r != 0: --i, replacing the flags
+    bhi     L_mainLoop  // loop iff the decrement ran, did not borrow, and i != 0
+
+//  Trial subtraction for shift 0, which the main loop does not perform when it
+//  exits by counting i down; only the quotient bit is needed from it here.
+    subs    r,      a,  b  // HS iff a >= b
+    orrhs   q,      #1
+
+L_return:
+//  Move the quotient to r0 and return.
+    mov     r0,     q
+    CLEAR_FRAME_AND_RETURN
--- a/compiler-rt/lib/arm/umodsi3.S
+++ b/compiler-rt/lib/arm/umodsi3.S
@@ -0,0 +1,58 @@
+/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __umodsi3 (32-bit unsigned integer modulus) 
+ * function for the ARM architecture.  A naive digit-by-digit computation is
+ * employed for simplicity.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include "../assembly.h"
+
+#define a r0  // dividend on entry; reduced in place until it is the remainder
+#define b r1  // divisor (as a macro this would also rewrite a bare b mnemonic)
+#define r r2  // trial difference a - (b << i)
+#define i r3  // shift count: bit position currently being tried
+
+.syntax unified
+.align 3
+DEFINE_COMPILERRT_FUNCTION(__umodsi3)
+//  In: r0 = a, r1 = b.  Out: r0 = remainder (a itself when b == 0).  No frame
+//  is set up: only r0-r3 are written and every exit is a bx lr.  First compute
+//  the left shift that aligns the MSB of the divisor with that of the dividend.
+    clz     r2,     a  // r2 = clz(a); r2 is reused as r once i is computed
+    tst     b,      b       // detect b == 0; the clz below keeps these flags
+    clz     r3,     b  // r3 = clz(b)
+    bxeq    lr              // return a if b == 0
+    subs    i,      r3, r2  // i = clz(b) - clz(a)
+    bxlt    lr              // return a if MSB(a) < MSB(b)
+
+L_mainLoop:
+//  This loop basically implements the following:
+//
+//  do {
+//      if (a >= b << i) {
+//          a -= b << i;
+//          if (a == 0) break;
+//      }
+//  } while (--i)
+//
+//  The shift-0 step is left to the code after the loop; that lets a single
+//  bhi act as both the a == 0 exit and the loop test.  If i starts at 0, bhi
+//  also falls through after one pass (subsne, when it runs, borrows).
+    subs    r,      a,  b, lsl i  // r = a - (b << i); HS iff a >= b << i, EQ iff r == 0
+    movhs   a,      r  // on HS: a = r
+    subsne  i,      i, #1  // only when r != 0: --i, replacing the flags
+    bhi     L_mainLoop  // loop iff the decrement ran, did not borrow, and i != 0
+
+//  Trial subtraction for shift 0, which the main loop does not perform when it
+//  exits by counting i down; update the remainder if it succeeds.
+    subs    r,      a,  b  // HS iff a >= b
+    movhs   a,      r
+    bx      lr  // r0 = remainder