forked from OSchip/llvm-project
ARM: fix division in some cases
For ARM cores that are ARMv6T2+ but not ARMv7ve or ARMv7-r, and not an updated ARMv7-a that has the idiv extension (i.e. chips with clz but without idiv), an incorrect jump target would be calculated due to the preference for Thumb instructions over ARM. Rather than computing the target at runtime, use a jump table instead. This trades a bit of storage for performance: the overhead is 32 bytes for each of the three routines, but it avoids the calculation of the offset. Because clz was introduced in ARMv6T2 and idiv in certain versions of ARMv7, the non-clz, non-idiv case implies a target which does not support Thumb-2, and thus we cannot use Thumb on those targets (as it is unlikely that the assembly will assemble). Take the opportunity to refactor the IT block macros into assembly.h rather than redefining them in the TUs where they are used. Existing tests cover the full change already, so no new tests are added. This effectively reverts SVN r213309. llvm-svn: 213467
This commit is contained in:
parent
8eb82fc453
commit
8817bfe7e2
|
@ -16,6 +16,9 @@
|
|||
|
||||
.syntax unified
|
||||
.text
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
.thumb
|
||||
#endif
|
||||
|
||||
.p2align 2
|
||||
DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
|
||||
|
@ -38,11 +41,15 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
|
|||
*
|
||||
* r0 is the numerator, r1 the denominator.
|
||||
*
|
||||
* ARM:
|
||||
* The code before JMP computes the correct shift I, so that
|
||||
* r0 and (r1 << I) have the highest bit set in the same position.
|
||||
* At the time of JMP, ip := .Ldiv0block - 12 * I.
|
||||
* This depends on the fixed instruction size of block.
|
||||
*
|
||||
* Thumb 2:
|
||||
* Uses a jumptable to jump to the appropriate block.
|
||||
*
|
||||
* block(shift) implements the test-and-update-quotient core.
|
||||
* It assumes (r0 << shift) can be computed without overflow and
|
||||
* that (r0 << shift) < 2 * r1. The quotient is stored in r3.
|
||||
|
@ -52,17 +59,59 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
|
|||
clz ip, r0
|
||||
clz r3, r1
|
||||
/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
sub ip, r3, ip
|
||||
mov r3, #0
|
||||
tbb [pc, ip]
|
||||
LOCAL_LABEL(JT):
|
||||
.byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2
|
||||
#else
|
||||
sub r3, r3, ip
|
||||
adr ip, LOCAL_LABEL(div0block)
|
||||
adr ip, LOCAL_LABEL(0)
|
||||
sub ip, ip, r3, lsl #2
|
||||
sub ip, ip, r3, lsl #3
|
||||
mov r3, #0
|
||||
bx ip
|
||||
#endif
|
||||
# else
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
#error unsupported configuration
|
||||
#endif
|
||||
str r4, [sp, #-8]!
|
||||
|
||||
mov r4, r0
|
||||
adr ip, LOCAL_LABEL(div0block)
|
||||
adr ip, LOCAL_LABEL(0)
|
||||
|
||||
lsr r3, r4, #16
|
||||
cmp r3, r1
|
||||
|
@ -96,9 +145,11 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
|
|||
|
||||
#define IMM #
|
||||
|
||||
#define block(shift) \
|
||||
cmp r0, r1, lsl IMM shift; \
|
||||
addhs r3, r3, IMM (1 << shift); \
|
||||
#define block(shift) \
|
||||
LOCAL_LABEL(shift): \
|
||||
cmp r0, r1, lsl IMM shift; \
|
||||
ITT hs; \
|
||||
addhs r3, r3, IMM (1 << shift); \
|
||||
subhs r0, r0, r1, lsl IMM shift
|
||||
|
||||
block(31)
|
||||
|
@ -132,7 +183,6 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
|
|||
block(3)
|
||||
block(2)
|
||||
block(1)
|
||||
LOCAL_LABEL(div0block):
|
||||
block(0)
|
||||
|
||||
str r0, [r2]
|
||||
|
|
|
@ -16,6 +16,9 @@
|
|||
|
||||
.syntax unified
|
||||
.text
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
.thumb
|
||||
#endif
|
||||
|
||||
.p2align 2
|
||||
DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
|
||||
|
@ -32,6 +35,7 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
|
|||
bcc LOCAL_LABEL(divby0)
|
||||
JMPc(lr, eq)
|
||||
cmp r0, r1
|
||||
IT cc
|
||||
movcc r0, #0
|
||||
JMPc(lr, cc)
|
||||
/*
|
||||
|
@ -39,11 +43,15 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
|
|||
*
|
||||
* r0 is the numerator, r1 the denominator.
|
||||
*
|
||||
* ARM:
|
||||
* The code before JMP computes the correct shift I, so that
|
||||
* r0 and (r1 << I) have the highest bit set in the same position.
|
||||
* At the time of JMP, ip := .Ldiv0block - 12 * I.
|
||||
* This depends on the fixed instruction size of block.
|
||||
*
|
||||
* Thumb 2:
|
||||
* Uses a jumptable to jump to the appropriate block.
|
||||
*
|
||||
* block(shift) implements the test-and-update-quotient core.
|
||||
* It assumes (r0 << shift) can be computed without overflow and
|
||||
* that (r0 << shift) < 2 * r1. The quotient is stored in r3.
|
||||
|
@ -53,15 +61,57 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
|
|||
clz ip, r0
|
||||
clz r3, r1
|
||||
/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
sub ip, r3, ip
|
||||
mov r3, #0
|
||||
tbb [pc, ip]
|
||||
LOCAL_LABEL(JT):
|
||||
.byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2
|
||||
#else
|
||||
sub r3, r3, ip
|
||||
adr ip, LOCAL_LABEL(div0block)
|
||||
adr ip, LOCAL_LABEL(0)
|
||||
sub ip, ip, r3, lsl #2
|
||||
sub ip, ip, r3, lsl #3
|
||||
mov r3, #0
|
||||
bx ip
|
||||
#endif
|
||||
# else
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
#error unsupported configuration
|
||||
#endif
|
||||
mov r2, r0
|
||||
adr ip, LOCAL_LABEL(div0block)
|
||||
adr ip, LOCAL_LABEL(0)
|
||||
|
||||
lsr r3, r2, #16
|
||||
cmp r3, r1
|
||||
|
@ -94,10 +144,12 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
|
|||
|
||||
#define IMM #
|
||||
|
||||
#define block(shift) \
|
||||
cmp r0, r1, lsl IMM shift; \
|
||||
addhs r3, r3, IMM (1 << shift); \
|
||||
subhs r0, r0, r1, lsl IMM shift
|
||||
#define block(shift) \
|
||||
LOCAL_LABEL(shift): \
|
||||
cmp r0, r1, lsl IMM shift; \
|
||||
ITT hs; \
|
||||
addhs r3, r3, IMM(1 << shift); \
|
||||
subhs r0, r0, r1, lsl IMM shift
|
||||
|
||||
block(31)
|
||||
block(30)
|
||||
|
@ -130,7 +182,6 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
|
|||
block(3)
|
||||
block(2)
|
||||
block(1)
|
||||
LOCAL_LABEL(div0block):
|
||||
block(0)
|
||||
|
||||
mov r0, r3
|
||||
|
|
|
@ -16,6 +16,9 @@
|
|||
|
||||
.syntax unified
|
||||
.text
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
.thumb
|
||||
#endif
|
||||
|
||||
.p2align 2
|
||||
DEFINE_COMPILERRT_FUNCTION(__umodsi3)
|
||||
|
@ -30,6 +33,7 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
|
|||
#else
|
||||
cmp r1, #1
|
||||
bcc LOCAL_LABEL(divby0)
|
||||
IT eq
|
||||
moveq r0, #0
|
||||
JMPc(lr, eq)
|
||||
cmp r0, r1
|
||||
|
@ -39,11 +43,15 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
|
|||
*
|
||||
* r0 is the numerator, r1 the denominator.
|
||||
*
|
||||
* For ARM:
|
||||
* The code before JMP computes the correct shift I, so that
|
||||
* r0 and (r1 << I) have the highest bit set in the same position.
|
||||
* At the time of JMP, ip := .Ldiv0block - 8 * I.
|
||||
* This depends on the fixed instruction size of block.
|
||||
*
|
||||
* For Thumb 2:
|
||||
* Uses a jumptable to jump to the appropriate block.
|
||||
*
|
||||
* block(shift) implements the test-and-update-quotient core.
|
||||
* It assumes (r0 << shift) can be computed without overflow and
|
||||
* that (r0 << shift) < 2 * r1. The quotient is stored in r3.
|
||||
|
@ -54,12 +62,52 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
|
|||
clz r3, r1
|
||||
/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
|
||||
sub r3, r3, ip
|
||||
adr ip, LOCAL_LABEL(div0block)
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
tbb [pc, r3]
|
||||
LOCAL_LABEL(JT):
|
||||
.byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2
|
||||
.byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2
|
||||
#else
|
||||
adr ip, LOCAL_LABEL(0)
|
||||
sub ip, ip, r3, lsl #3
|
||||
bx ip
|
||||
#endif
|
||||
# else
|
||||
#if __ARM_ARCH_ISA_THUMB == 2
|
||||
#error unsupported configuration
|
||||
#endif
|
||||
mov r2, r0
|
||||
adr ip, LOCAL_LABEL(div0block)
|
||||
adr ip, LOCAL_LABEL(0)
|
||||
|
||||
lsr r3, r2, #16
|
||||
cmp r3, r1
|
||||
|
@ -90,9 +138,11 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
|
|||
|
||||
#define IMM #
|
||||
|
||||
#define block(shift) \
|
||||
cmp r0, r1, lsl IMM shift; \
|
||||
subhs r0, r0, r1, lsl IMM shift
|
||||
#define block(shift) \
|
||||
LOCAL_LABEL(shift): \
|
||||
cmp r0, r1, lsl IMM shift; \
|
||||
IT hs; \
|
||||
subhs r0, r0, r1, lsl IMM shift
|
||||
|
||||
block(31)
|
||||
block(30)
|
||||
|
@ -125,7 +175,6 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
|
|||
block(3)
|
||||
block(2)
|
||||
block(1)
|
||||
LOCAL_LABEL(div0block):
|
||||
block(0)
|
||||
JMP(lr)
|
||||
#endif /* __ARM_ARCH_EXT_IDIV__ */
|
||||
|
|
|
@ -22,6 +22,16 @@
|
|||
#define SEPARATOR ;
|
||||
#endif
|
||||
|
||||
/*
 * IT / ITT open an If-Then block for the next one (IT) or two (ITT)
 * conditional instructions.  They are used as `IT <cond>` followed by a
 * statement separator and the conditional instruction(s), all of which end up
 * on a single line once the enclosing macro (block(), JMPc()) is expanded.
 *
 * The mnemonics are emitted for both instruction sets.  Expanding to the `@`
 * comment character when not building for Thumb-2 would comment out the rest
 * of that line, i.e. the conditional instructions themselves would be
 * silently dropped.  In unified syntax the assembler accepts `it`/`itt` in
 * ARM state, checks the conditions of the following instructions, and
 * generates no code for the IT itself.
 *
 * NOTE(review): this relies on each user selecting `.syntax unified` before
 * the first use; the division routines do, confirm for any other user of
 * JMPc().
 */
#if defined(__arm__)
#define IT it
#define ITT itt
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#define HIDDEN(name) .private_extern name
|
||||
#define LOCAL_LABEL(name) L_##name
|
||||
|
@ -86,7 +96,9 @@
|
|||
|
||||
#ifdef ARM_HAS_BX
|
||||
#define JMP(r) bx r
|
||||
#define JMPc(r, c) bx##c r
|
||||
#define JMPc(r, c) \
|
||||
IT c; \
|
||||
bx##c r
|
||||
#else
|
||||
#define JMP(r) mov pc, r
|
||||
#define JMPc(r, c) mov##c pc, r
|
||||
|
|
Loading…
Reference in New Issue