Carefully written implementations of the 32-bit integer divide and modulus functions for ARM. These are still using a naive digit-by-digit algorithm, but the core loop has been carefully written.

llvm-svn: 127882
This commit is contained in:
Stephen Canon 2011-03-18 16:35:02 +00:00
parent 96a4bddefb
commit 5abb5c14c4
6 changed files with 337 additions and 32 deletions

View File

@ -0,0 +1,47 @@
/*===-- divmodsi4.S - 32-bit signed integer divide and modulus ------------===//
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
*===----------------------------------------------------------------------===//
*
* This file implements the __divmodsi4 (32-bit signed integer divide and
* modulus) function for the ARM architecture. A naive digit-by-digit
* computation is employed for simplicity.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
// Frame setup: saves r4-r7 and lr, then points r7 at the saved r7 slot
// (r4, r5 and r6 occupy the 12 bytes below it).
#define ESTABLISH_FRAME \
    push    {r4-r7, lr} ;\
    add     r7, sp, #12
// Restores r4-r7 and returns by popping the saved lr into pc.
#define CLEAR_FRAME_AND_RETURN \
    pop     {r4-r7, pc}
.syntax unified
.align 3
// __divmodsi4: signed 32-bit divide with remainder.
//   In:  r0 = a (dividend), r1 = b (divisor), r2 = address for the modulus
//   Out: r0 = quotient; the modulus is written through the address passed in r2
// The work is delegated to ___udivmodsi4 on abs(a) and abs(b); the signs are
// reapplied afterwards. r2 is left untouched before the call so the unsigned
// routine stores its remainder through the same address.
DEFINE_COMPILERRT_FUNCTION(__divmodsi4)
    ESTABLISH_FRAME
    // Values that must survive the call live in callee-saved registers.
    mov     r6, r2                  // r6 = address of the modulus
    mov     r5, r0                  // r5 = a; bit 31 is the sign of the modulus
    eor     r4, r0, r1              // r4 bit 31 = sign of the quotient
    // abs(x) = (x ^ (x >> 31)) - (x >> 31), with an arithmetic shift.
    eor     r3, r0, r0, asr #31
    sub     r0, r3, r0, asr #31     // r0 = abs(a)
    eor     ip, r1, r1, asr #31
    sub     r1, ip, r1, asr #31     // r1 = abs(b)
    // Unsigned divide and modulus on the magnitudes.
    bl      ___udivmodsi4
    // Negate the quotient when the operand signs differed.
    eor     r0, r0, r4, asr #31
    sub     r0, r0, r4, asr #31
    // Fetch the unsigned modulus, give it the sign of a, and store it back.
    ldr     r1, [r6]
    eor     r1, r1, r5, asr #31
    sub     r1, r1, r5, asr #31
    str     r1, [r6]
    CLEAR_FRAME_AND_RETURN

View File

@ -0,0 +1,39 @@
/*===-- divsi3.S - 32-bit signed integer divide ---------------------------===//
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
*===----------------------------------------------------------------------===//
*
* This file implements the __divsi3 (32-bit signed integer divide) function
* for the ARM architecture as a wrapper around the unsigned routine.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
// Frame setup: saves r4, r7 and lr, then points r7 at the saved r7 slot.
#define ESTABLISH_FRAME \
    push    {r4, r7, lr} ;\
    add     r7, sp, #4
// Restores r4 and r7 and returns by popping the saved lr into pc.
#define CLEAR_FRAME_AND_RETURN \
    pop     {r4, r7, pc}
.syntax unified
.align 3
// __divsi3: signed 32-bit divide.
//   In:  r0 = a (dividend), r1 = b (divisor)
//   Out: r0 = quotient
// Divides abs(a) by abs(b) with ___udivsi3, then negates the result when the
// operand signs differ.
DEFINE_COMPILERRT_FUNCTION(__divsi3)
    ESTABLISH_FRAME
    // r4 is callee-saved, so the quotient sign (bit 31) survives the call.
    eor     r4, r0, r1
    // abs(x) = (x ^ (x >> 31)) - (x >> 31), with an arithmetic shift.
    eor     r2, r0, r0, asr #31
    sub     r0, r2, r0, asr #31     // r0 = abs(a)
    eor     r3, r1, r1, asr #31
    sub     r1, r3, r1, asr #31     // r1 = abs(b)
    // r0 = abs(a) / abs(b)
    bl      ___udivsi3
    // Conditionally negate: (q ^ mask) - mask, where mask = r4 >> 31.
    eor     r0, r0, r4, asr #31
    sub     r0, r0, r4, asr #31
    CLEAR_FRAME_AND_RETURN

View File

@ -1,36 +1,39 @@
//===-------- modsi3.S - Implement modsi3 ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/*===-- modsi3.S - 32-bit signed integer modulus --------------------------===//
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
*===----------------------------------------------------------------------===//
*
* This file implements the __modsi3 (32-bit signed integer modulus) function
* for the ARM architecture as a wrapper around the unsigned routine.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
//
// extern int32_t __modsi3(int32_t a, int32_t b);
//
// Returns the remainder when dividing two 32-bit signed integers.
// Conceptually, the function is: { return a - (a / b) * b; }
// But if you write that in C, llvm compiles it to a call to __modsi3...
//
.align 2
// Frame setup: saves r4, r7 and lr, then points r7 at the saved r7 slot.
#define ESTABLISH_FRAME \
    push    {r4, r7, lr} ;\
    add     r7, sp, #4
// Restores r4 and r7 and returns by popping the saved lr into pc.
#define CLEAR_FRAME_AND_RETURN \
    pop     {r4, r7, pc}
.syntax unified
.align 3
// __modsi3: signed 32-bit modulus.
//   In:  r0 = a (dividend), r1 = b (divisor)
//   Out: r0 = remainder, carrying the sign of a
// Implemented as a wrapper around ___umodsi3 on abs(a) and abs(b).
//
// The earlier body that called ___divsi3 and then computed a - (a / b) * b has
// been removed: it sat directly after the entry label and returned on its own,
// which left this wrapper unreachable behind it.
DEFINE_COMPILERRT_FUNCTION(__modsi3)
    ESTABLISH_FRAME
    // Set aside the dividend; its sign bit decides the sign of the result.
    mov     r4, r0
    // Take absolute value of a and b via abs(x) = (x^(x >> 31)) - (x >> 31).
    eor     r2, r0, r0, asr #31
    eor     r3, r1, r1, asr #31
    sub     r0, r2, r0, asr #31
    sub     r1, r3, r1, asr #31
    // abs(a) % abs(b)
    bl      ___umodsi3
    // Apply sign of dividend to result and return.
    eor     r0, r0, r4, asr #31
    sub     r0, r0, r4, asr #31
    CLEAR_FRAME_AND_RETURN

View File

@ -0,0 +1,80 @@
/*===-- udivmodsi4.S - 32-bit unsigned integer divide and modulus ---------===//
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
*===----------------------------------------------------------------------===//
*
* This file implements the __udivmodsi4 (32-bit unsigned integer divide and
* modulus) function for the ARM architecture. A naive digit-by-digit
* computation is employed for simplicity.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
// Frame setup: saves r4 (used below as the scratch register r), r7 and lr
// (lr is reused below as the constant one), then points r7 at the saved r7
// slot.
#define ESTABLISH_FRAME \
push {r4, r7, lr} ;\
add r7, sp, #4
// Restores r4 and r7 and returns by popping the saved lr into pc.
#define CLEAR_FRAME_AND_RETURN \
pop {r4, r7, pc}
// Register roles:
//   a   - dividend on entry; holds the running remainder afterwards
//   b   - divisor (never written)
//   i   - current shift amount / quotient bit position
//   r   - trial difference a - (b << i)
//   q   - quotient being accumulated
//   one - constant 1, shifted into place to set quotient bits
#define a r0
#define b r1
#define i r3
#define r r4
#define q ip
#define one lr
.syntax unified
.align 3
// __udivmodsi4: unsigned 32-bit divide and modulus.
//   In:  r0 = a (dividend), r1 = b (divisor), r2 = address for the remainder
//   Out: r0 = quotient; the remainder is stored to [r2]
// When b is zero the quotient returned is 0 and the value stored to [r2] is
// the unmodified dividend.
DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
// We use a simple digit by digit algorithm; before we get into the actual
// divide loop, we must calculate the left-shift amount necessary to align
// the MSB of the divisor with that of the dividend (If this shift is
// negative, then the result is zero, and we early out). We also conjure a
// bit mask of 1 to use in constructing the quotient, and initialize the
// quotient to zero.
ESTABLISH_FRAME
// r4 = count of leading zeros in a (r4 is only reused as r after this point).
clz r4, a
tst b, b // detect divide-by-zero
// clz and mov do not touch the flags, so the Z flag from tst is still valid
// at the beq below.
clz r3, b
mov q, #0
beq L_return // return 0 if b is zero.
mov one, #1
// i = clz(b) - clz(a): the shift that lines up the MSB of b with that of a.
subs i, r3, r4
blt L_return // return 0 if MSB(a) < MSB(b)
L_mainLoop:
// This loop basically implements the following:
//
// do {
// if (a >= b << i) {
// a -= b << i;
// q |= 1 << i;
// if (a == 0) break;
// }
// } while (--i)
//
// Note that this does not perform the final iteration (i == 0); by doing it
// this way, we can merge the two branches which is a substantial win for
// such a tight loop on current ARM architectures.
//
// Flag usage: the first subs sets C when a >= (b << i) and Z when they are
// equal. On equality subsne is skipped, so Z stays set and bhi falls through
// (the break). Otherwise subsne decrements i, and bhi loops only while the
// decrement did not borrow and did not reach zero. If i was already 0 on
// entry the decrement borrows, so the loop is also left in that case.
subs r, a, b, lsl i
orrhs q, q,one, lsl i
movhs a, r
subsne i, i, #1
bhi L_mainLoop
// Do the final test subtraction and update of quotient (i == 0), as it is
// not performed in the main loop.
subs r, a, b
orrhs q, #1
movhs a, r
L_return:
// Store the remainder, and move the quotient to r0, then return.
str a, [r2]
mov r0, q
CLEAR_FRAME_AND_RETURN

View File

@ -0,0 +1,78 @@
/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
*===----------------------------------------------------------------------===//
*
* This file implements the __udivsi3 (32-bit unsigned integer divide)
* function for the ARM architecture. A naive digit-by-digit computation is
* employed for simplicity.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
// Frame setup: saves r7 and lr (lr is reused below as the constant one), then
// points r7 at the saved r7 slot, which is the new top of stack.
#define ESTABLISH_FRAME \
push {r7, lr} ;\
mov r7, sp
// Restores r7 and returns by popping the saved lr into pc.
#define CLEAR_FRAME_AND_RETURN \
pop {r7, pc}
// Register roles:
//   a   - dividend on entry; holds the running remainder afterwards
//   b   - divisor (never written)
//   r   - trial difference a - (b << i)
//   i   - current shift amount / quotient bit position
//   q   - quotient being accumulated
//   one - constant 1, shifted into place to set quotient bits
#define a r0
#define b r1
#define r r2
#define i r3
#define q ip
#define one lr
.syntax unified
.align 3
// __udivsi3: unsigned 32-bit divide.
//   In:  r0 = a (dividend), r1 = b (divisor)
//   Out: r0 = quotient (0 when b is zero)
DEFINE_COMPILERRT_FUNCTION(__udivsi3)
// We use a simple digit by digit algorithm; before we get into the actual
// divide loop, we must calculate the left-shift amount necessary to align
// the MSB of the divisor with that of the dividend (If this shift is
// negative, then the result is zero, and we early out). We also conjure a
// bit mask of 1 to use in constructing the quotient, and initialize the
// quotient to zero.
ESTABLISH_FRAME
// r2 = count of leading zeros in a (r2 is only reused as r after this point).
clz r2, a
tst b, b // detect divide-by-zero
// clz and mov do not touch the flags, so the Z flag from tst is still valid
// at the beq below.
clz r3, b
mov q, #0
beq L_return // return 0 if b is zero.
mov one, #1
// i = clz(b) - clz(a): the shift that lines up the MSB of b with that of a.
subs i, r3, r2
blt L_return // return 0 if MSB(a) < MSB(b)
L_mainLoop:
// This loop basically implements the following:
//
// do {
// if (a >= b << i) {
// a -= b << i;
// q |= 1 << i;
// if (a == 0) break;
// }
// } while (--i)
//
// Note that this does not perform the final iteration (i == 0); by doing it
// this way, we can merge the two branches which is a substantial win for
// such a tight loop on current ARM architectures.
//
// Flag usage: the first subs sets C when a >= (b << i) and Z when they are
// equal. On equality subsne is skipped, so Z stays set and bhi falls through
// (the break). Otherwise subsne decrements i, and bhi loops only while the
// decrement did not borrow and did not reach zero. If i was already 0 on
// entry the decrement borrows, so the loop is also left in that case.
subs r, a, b, lsl i
orrhs q, q,one, lsl i
movhs a, r
subsne i, i, #1
bhi L_mainLoop
// Do the final test subtraction and update of quotient (i == 0), as it is
// not performed in the main loop.
// Only the quotient bit is needed here; the remainder is not written back
// because this routine does not return it.
subs r, a, b
orrhs q, #1
L_return:
// Move the quotient to r0 and return.
mov r0, q
CLEAR_FRAME_AND_RETURN

View File

@ -0,0 +1,58 @@
/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
*===----------------------------------------------------------------------===//
*
* This file implements the __umodsi3 (32-bit unsigned integer modulus)
* function for the ARM architecture. A naive digit-by-digit computation is
* employed for simplicity.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
// Register roles:
//   a - dividend on entry; holds the running remainder, which is the result
//   b - divisor (never written)
//   r - trial difference a - (b << i)
//   i - current shift amount
#define a r0
#define b r1
#define r r2
#define i r3
.syntax unified
.align 3
// __umodsi3: unsigned 32-bit modulus.
//   In:  r0 = a (dividend), r1 = b (divisor)
//   Out: r0 = remainder (a itself when b is zero)
// Only r0, r2, r3 and the flags are written, so no stack frame is set up and
// every exit is a plain return through lr.
DEFINE_COMPILERRT_FUNCTION(__umodsi3)
// We use a simple digit by digit algorithm; before we get into the actual
// divide loop, we must calculate the left-shift amount necessary to align
// the MSB of the divisor with that of the dividend.
// r2 = count of leading zeros in a (r2 is only reused as r after this point).
clz r2, a
tst b, b // detect b == 0
// clz does not touch the flags, so the Z flag from tst is still valid at the
// bxeq below.
clz r3, b
bxeq lr // return a if b == 0
// i = clz(b) - clz(a): the shift that lines up the MSB of b with that of a.
subs i, r3, r2
bxlt lr // return a if MSB(a) < MSB(b)
L_mainLoop:
// This loop basically implements the following:
//
// do {
// if (a >= b << i) {
// a -= b << i;
// if (a == 0) break;
// }
// } while (--i)
//
// Note that this does not perform the final iteration (i == 0); by doing it
// this way, we can merge the two branches which is a substantial win for
// such a tight loop on current ARM architectures.
//
// Flag usage: the first subs sets C when a >= (b << i) and Z when they are
// equal. On equality subsne is skipped, so Z stays set and bhi falls through
// (the break). Otherwise subsne decrements i, and bhi loops only while the
// decrement did not borrow and did not reach zero. If i was already 0 on
// entry the decrement borrows, so the loop is also left in that case.
subs r, a, b, lsl i
movhs a, r
subsne i, i, #1
bhi L_mainLoop
// Do the final test subtraction and update of remainder (i == 0), as it is
// not performed in the main loop.
subs r, a, b
movhs a, r
bx lr