llvm-project/compiler-rt/lib/builtins/arm/udivmodsi4.S

//===-- udivmodsi4.S - 32-bit unsigned integer divide and modulus ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the __udivmodsi4 (32-bit unsigned integer divide and
// modulus) function for the ARM 32-bit architecture.
//
//===----------------------------------------------------------------------===//

#include "../assembly.h"

	.syntax unified
	.text
	DEFINE_CODE_STATE

@ unsigned int __udivmodsi4(unsigned int dividend, unsigned int divisor,
@                           unsigned int *remainder)
@   Calculate the quotient and remainder of the (unsigned) division.  The return
@   value is the quotient; the remainder is stored through the remainder pointer.

	.p2align 2
DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
#if __ARM_ARCH_EXT_IDIV__
	tst     r1, r1
	beq     LOCAL_LABEL(divby0)
	mov 	r3, r0
	udiv	r0, r3, r1
	mls 	r1, r0, r1, r3
	str 	r1, [r2]
	bx  	lr
#else
	cmp	r1, #1
	bcc	LOCAL_LABEL(divby0)
	beq	LOCAL_LABEL(divby1)
	cmp	r0, r1
	bcc	LOCAL_LABEL(quotient0)

	// Implement division using binary long division algorithm.
	//
	// r0 is the numerator, r1 the denominator.
	//
	// The code before JMP computes the correct shift I, so that
	// r0 and (r1 << I) have the highest bit set in the same position.
	// At the time of JMP, ip := .Ldiv0block - 12 * I.
	// This depends on the fixed instruction size of block.
	// For ARM mode, this is 12 Bytes, for THUMB mode 14 Bytes.
	//
	// block(shift) implements the test-and-update-quotient core.
	// It assumes (r0 << shift) can be computed without overflow and
	// that (r0 << shift) < 2 * r1. The quotient is stored in r3.

#  ifdef __ARM_FEATURE_CLZ
	clz	ip, r0
	clz	r3, r1
	// r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3.
	sub	r3, r3, ip
#    if defined(USE_THUMB_2)
	adr	ip, LOCAL_LABEL(div0block) + 1
	sub	ip, ip, r3, lsl #1
#    else
	adr	ip, LOCAL_LABEL(div0block)
#    endif
	sub	ip, ip, r3, lsl #2
	sub	ip, ip, r3, lsl #3
	mov	r3, #0
	bx	ip
#  else
#    if defined(USE_THUMB_2)
#    error THUMB mode requires CLZ or UDIV
#    endif
	str	r4, [sp, #-8]!

	mov	r4, r0
	adr	ip, LOCAL_LABEL(div0block)

	lsr	r3, r4, #16
	cmp	r3, r1
	movhs	r4, r3
	subhs	ip, ip, #(16 * 12)

	lsr	r3, r4, #8
	cmp	r3, r1
	movhs	r4, r3
	subhs	ip, ip, #(8 * 12)

	lsr	r3, r4, #4
	cmp	r3, r1
	movhs	r4, r3
	subhs	ip, #(4 * 12)

	lsr	r3, r4, #2
	cmp	r3, r1
	movhs	r4, r3
	subhs	ip, ip, #(2 * 12)

	// Last block, no need to update r3 or r4.
	cmp	r1, r4, lsr #1
	subls	ip, ip, #(1 * 12)

	ldr	r4, [sp], #8	// restore r4, we are done with it.
	mov	r3, #0

	JMP(ip)
#  endif

#define	IMM	#

#define block(shift)                                                           \
	cmp	r0, r1, lsl IMM shift;                                         \
	ITT(hs);                                                               \
	WIDE(addhs)	r3, r3, IMM (1 << shift);                              \
	WIDE(subhs)	r0, r0, r1, lsl IMM shift

	block(31)
	block(30)
	block(29)
	block(28)
	block(27)
	block(26)
	block(25)
	block(24)
	block(23)
	block(22)
	block(21)
	block(20)
	block(19)
	block(18)
	block(17)
	block(16)
	block(15)
	block(14)
	block(13)
	block(12)
	block(11)
	block(10)
	block(9)
	block(8)
	block(7)
	block(6)
	block(5)
	block(4)
	block(3)
	block(2)
	block(1)
LOCAL_LABEL(div0block):
	block(0)

	str	r0, [r2]
	mov	r0, r3
	JMP(lr)

LOCAL_LABEL(quotient0):
	str	r0, [r2]
	mov	r0, #0
	JMP(lr)

LOCAL_LABEL(divby1):
	mov	r3, #0
	str	r3, [r2]
	JMP(lr)
#endif // __ARM_ARCH_EXT_IDIV__

LOCAL_LABEL(divby0):
	mov	r0, #0
#ifdef __ARM_EABI__
	b	__aeabi_idiv0
#else
	JMP(lr)
#endif

END_COMPILERRT_FUNCTION(__udivmodsi4)

NO_EXEC_STACK_DIRECTIVE
[builtins] Use single line C++/C99 comment style Use the uniform single line C++/99 style for code comments. This is part of the cleanup proposed in "[RFC] compiler-rt builtins cleanup and refactoring". Differential Revision: https://reviews.llvm.org/D60352 llvm-svn: 359411 2019-04-29 06:47:49 +08:00			`//===-- udivmodsi4.S - 32-bit unsigned integer divide and modulus ---------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`
			`//`
			`// This file implements the __udivmodsi4 (32-bit unsigned integer divide and`
			`// modulus) function for the ARM 32-bit architecture.`
			`//`
			`//===----------------------------------------------------------------------===//`
Carefully written implementations of the 32-bit integer divide and modulus functions for ARM. These are still using a naive digit-by-digit algorithm, but the core loop has been carefully written. llvm-svn: 127882 2011-03-19 00:35:02 +08:00
			`#include "../assembly.h"`

Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`.syntax unified`
			`.text`
[builtins][ARM] Select correct code fragments when compiling for Thumb1/Thum2/ARM ISA Summary: Value of __ARM_ARCH_ISA_THUMB isn't based on the actual compilation mode (-mthumb, -marm), it reflect's capability of given CPU. Due to this: •use tbumb and thumb2 insteand of __ARM_ARCH_ISA_THUMB •use '.thumb' directive consistently in all affected files •decorate all thumb functions using DEFINE_COMPILERRT_THUMB_FUNCTION() (This is based off Michal's patch https://reviews.llvm.org/D30938) Reviewers: dim, rengolin, compnerd, strejda Reviewed By: compnerd Subscribers: peter.smith, kubamracek, mgorny, javed.absar, kristof.beyls, jamesduley, aemerson, llvm-commits Differential Revision: https://reviews.llvm.org/D31220 llvm-svn: 310884 2017-08-15 04:48:47 +08:00			`DEFINE_CODE_STATE`
Redo THUMB support. Discussed with and tested by: Saleem Abdulrasool llvm-svn: 213481 2014-07-21 04:53:37 +08:00
builtins: add signature to some assembly routines Add a helpful description and a signature for the functions implemented in assembly for the integral math routines. NFC. llvm-svn: 215296 2014-08-10 04:17:43 +08:00			`@ unsigned int __udivmodsi4(unsigned int divident, unsigned int divisor,`
			`@ unsigned int *remainder)`
			`@ Calculate the quotient and remainder of the (unsigned) division. The return`
			`@ value is the quotient, the remainder is placed in the variable.`

Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`.p2align 2`
builtins: rework use of DEFINE_COMPILERRT_THUMB_FUNCTION This is simply to help clarity of the code. The functions are built as thumb only if Thumb2 is available (__ARM_ARCH_ISA_THUMB == 2). Sink the selection into the location of the definition and make DEFINE_COMPILERRT_THUMB_FUNCTION always define a thumb function while DEFINE_COMPILERRT_FUNCTION always selects the default. Since the .thumb_func directive is always available (at least on Linux, Windows, and BSD), sinking the macro right into the macro works just as well. No functional change intended. llvm-svn: 219182 2014-10-07 10:39:13 +08:00			`DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)`
Switch __ARM_ARCH_7S__ to __ARM_ARCH_EXT_IDIV__ for use of sdiv/udiv assembly. __ARM_ARCH_EXT_IDIV__ is the define that ARM is using to indicate the presence of hardware integer divide (sdiv/udiv). Previously, this code was only being invoked for processors marked 7S. We now can correctly generate hardware divides on cortex-a15 devices. llvm-svn: 193392 2013-10-25 14:26:44 +08:00			`#if __ARM_ARCH_EXT_IDIV__`
<rdar://problem/12512722> Use arm divide instruction if available llvm-svn: 182665 2013-05-25 03:38:11 +08:00			`tst r1, r1`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`beq LOCAL_LABEL(divby0)`
<rdar://problem/12512722> Use arm divide instruction if available llvm-svn: 182665 2013-05-25 03:38:11 +08:00			`mov r3, r0`
			`udiv r0, r3, r1`
			`mls r1, r0, r1, r3`
			`str r1, [r2]`
			`bx lr`
			`#else`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`cmp r1, #1`
			`bcc LOCAL_LABEL(divby0)`
			`beq LOCAL_LABEL(divby1)`
			`cmp r0, r1`
			`bcc LOCAL_LABEL(quotient0)`
[builtins] Use single line C++/C99 comment style Use the uniform single line C++/99 style for code comments. This is part of the cleanup proposed in "[RFC] compiler-rt builtins cleanup and refactoring". Differential Revision: https://reviews.llvm.org/D60352 llvm-svn: 359411 2019-04-29 06:47:49 +08:00
			`// Implement division using binary long division algorithm.`
			`//`
			`// r0 is the numerator, r1 the denominator.`
			`//`
			`// The code before JMP computes the correct shift I, so that`
			`// r0 and (r1 << I) have the highest bit set in the same position.`
			`// At the time of JMP, ip := .Ldiv0block - 12 * I.`
			`// This depends on the fixed instruction size of block.`
			`// For ARM mode, this is 12 Bytes, for THUMB mode 14 Bytes.`
			`//`
			`// block(shift) implements the test-and-update-quotient core.`
			`// It assumes (r0 << shift) can be computed without overflow and`
			`// that (r0 << shift) < 2 * r1. The quotient is stored in r3.`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00
			`# ifdef __ARM_FEATURE_CLZ`
			`clz ip, r0`
			`clz r3, r1`
[builtins] Use single line C++/C99 comment style Use the uniform single line C++/99 style for code comments. This is part of the cleanup proposed in "[RFC] compiler-rt builtins cleanup and refactoring". Differential Revision: https://reviews.llvm.org/D60352 llvm-svn: 359411 2019-04-29 06:47:49 +08:00			`// r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3.`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`sub r3, r3, ip`
[builtins][ARM] Select correct code fragments when compiling for Thumb1/Thum2/ARM ISA Summary: Value of __ARM_ARCH_ISA_THUMB isn't based on the actual compilation mode (-mthumb, -marm), it reflect's capability of given CPU. Due to this: •use tbumb and thumb2 insteand of __ARM_ARCH_ISA_THUMB •use '.thumb' directive consistently in all affected files •decorate all thumb functions using DEFINE_COMPILERRT_THUMB_FUNCTION() (This is based off Michal's patch https://reviews.llvm.org/D30938) Reviewers: dim, rengolin, compnerd, strejda Reviewed By: compnerd Subscribers: peter.smith, kubamracek, mgorny, javed.absar, kristof.beyls, jamesduley, aemerson, llvm-commits Differential Revision: https://reviews.llvm.org/D31220 llvm-svn: 310884 2017-08-15 04:48:47 +08:00			`# if defined(USE_THUMB_2)`
Redo THUMB support. Discussed with and tested by: Saleem Abdulrasool llvm-svn: 213481 2014-07-21 04:53:37 +08:00			`adr ip, LOCAL_LABEL(div0block) + 1`
			`sub ip, ip, r3, lsl #1`
			`# else`
Revert r213467, it breaks non-thumb mode. llvm-svn: 213479 2014-07-21 04:00:26 +08:00			`adr ip, LOCAL_LABEL(div0block)`
Redo THUMB support. Discussed with and tested by: Saleem Abdulrasool llvm-svn: 213481 2014-07-21 04:53:37 +08:00			`# endif`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`sub ip, ip, r3, lsl #2`
			`sub ip, ip, r3, lsl #3`
			`mov r3, #0`
			`bx ip`
			`# else`
[builtins][ARM] Select correct code fragments when compiling for Thumb1/Thum2/ARM ISA Summary: Value of __ARM_ARCH_ISA_THUMB isn't based on the actual compilation mode (-mthumb, -marm), it reflect's capability of given CPU. Due to this: •use tbumb and thumb2 insteand of __ARM_ARCH_ISA_THUMB •use '.thumb' directive consistently in all affected files •decorate all thumb functions using DEFINE_COMPILERRT_THUMB_FUNCTION() (This is based off Michal's patch https://reviews.llvm.org/D30938) Reviewers: dim, rengolin, compnerd, strejda Reviewed By: compnerd Subscribers: peter.smith, kubamracek, mgorny, javed.absar, kristof.beyls, jamesduley, aemerson, llvm-commits Differential Revision: https://reviews.llvm.org/D31220 llvm-svn: 310884 2017-08-15 04:48:47 +08:00			`# if defined(USE_THUMB_2)`
Redo THUMB support. Discussed with and tested by: Saleem Abdulrasool llvm-svn: 213481 2014-07-21 04:53:37 +08:00			`# error THUMB mode requires CLZ or UDIV`
			`# endif`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`str r4, [sp, #-8]!`

			`mov r4, r0`
Revert r213467, it breaks non-thumb mode. llvm-svn: 213479 2014-07-21 04:00:26 +08:00			`adr ip, LOCAL_LABEL(div0block)`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00
			`lsr r3, r4, #16`
			`cmp r3, r1`
			`movhs r4, r3`
			`subhs ip, ip, #(16 * 12)`

			`lsr r3, r4, #8`
			`cmp r3, r1`
			`movhs r4, r3`
			`subhs ip, ip, #(8 * 12)`

			`lsr r3, r4, #4`
			`cmp r3, r1`
			`movhs r4, r3`
			`subhs ip, #(4 * 12)`

			`lsr r3, r4, #2`
			`cmp r3, r1`
			`movhs r4, r3`
			`subhs ip, ip, #(2 * 12)`

[builtins] Use single line C++/C99 comment style Use the uniform single line C++/99 style for code comments. This is part of the cleanup proposed in "[RFC] compiler-rt builtins cleanup and refactoring". Differential Revision: https://reviews.llvm.org/D60352 llvm-svn: 359411 2019-04-29 06:47:49 +08:00			`// Last block, no need to update r3 or r4.`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`cmp r1, r4, lsr #1`
			`subls ip, ip, #(1 * 12)`

[builtins] Use single line C++/C99 comment style Use the uniform single line C++/99 style for code comments. This is part of the cleanup proposed in "[RFC] compiler-rt builtins cleanup and refactoring". Differential Revision: https://reviews.llvm.org/D60352 llvm-svn: 359411 2019-04-29 06:47:49 +08:00			`ldr r4, [sp], #8 // restore r4, we are done with it.`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`mov r3, #0`

			`JMP(ip)`
			`# endif`

			`#define IMM #`

builtins: whitespace llvm-svn: 214044 2014-07-27 10:01:15 +08:00			`#define block(shift) \`
			`cmp r0, r1, lsl IMM shift; \`
			`ITT(hs); \`
builtins: make ARM compilation with GAS work again The LLVM IAS seems to accept wide instructions for add and sub in ARM mode even though it is not permitted. This uses a macro to ensure that the wide modifier is only applied when building in THUMB mode. This repairs building with GCC/GAS in ARM mode. llvm-svn: 214046 2014-07-27 10:01:24 +08:00			`WIDE(addhs) r3, r3, IMM (1 << shift); \`
			`WIDE(subhs) r0, r0, r1, lsl IMM shift`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00
			`block(31)`
			`block(30)`
			`block(29)`
			`block(28)`
			`block(27)`
			`block(26)`
			`block(25)`
			`block(24)`
			`block(23)`
			`block(22)`
			`block(21)`
			`block(20)`
			`block(19)`
			`block(18)`
			`block(17)`
			`block(16)`
			`block(15)`
			`block(14)`
			`block(13)`
			`block(12)`
			`block(11)`
			`block(10)`
			`block(9)`
			`block(8)`
			`block(7)`
			`block(6)`
			`block(5)`
			`block(4)`
			`block(3)`
			`block(2)`
			`block(1)`
Revert r213467, it breaks non-thumb mode. llvm-svn: 213479 2014-07-21 04:00:26 +08:00			`LOCAL_LABEL(div0block):`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00			`block(0)`

			`str r0, [r2]`
			`mov r0, r3`
			`JMP(lr)`

			`LOCAL_LABEL(quotient0):`
			`str r0, [r2]`
			`mov r0, #0`
			`JMP(lr)`

			`LOCAL_LABEL(divby1):`
			`mov r3, #0`
			`str r3, [r2]`
			`JMP(lr)`
[builtins] Use single line C++/C99 comment style Use the uniform single line C++/99 style for code comments. This is part of the cleanup proposed in "[RFC] compiler-rt builtins cleanup and refactoring". Differential Revision: https://reviews.llvm.org/D60352 llvm-svn: 359411 2019-04-29 06:47:49 +08:00			`#endif // __ARM_ARCH_EXT_IDIV__`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00
			`LOCAL_LABEL(divby0):`
			`mov r0, #0`
			`#ifdef __ARM_EABI__`
			`b __aeabi_idiv0`
			`#else`
			`JMP(lr)`
<rdar://problem/12512722> Use arm divide instruction if available llvm-svn: 182665 2013-05-25 03:38:11 +08:00			`#endif`
Provide support for ARMv4, lacking bx and clz. Unroll the test-and-subtract loop and compute the initial block as address, shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi. Code written by Matt Thomas and Joerg Sonnenberger. Differential Revision: http://llvm-reviews.chandlerc.com/D2595 llvm-svn: 200001 2014-01-24 21:43:35 +08:00
			`END_COMPILERRT_FUNCTION(__udivmodsi4)`
builtins: tag with noexecstack These routines do not require executable stacks. However, by default ELFish linkers may assume an executable stack on GNUish environments (and some non-GNU ones too!). The GNU extension to add a note to indicate a non-executable stack is honoured by these environments to mark the stack as non-executable (the compiler normally emits this directive on appropriate targets whenever possible). This allows normal builds from getting executable stacks due to linking to the compiler rt builtins. llvm-svn: 273500 2016-06-23 06:09:42 +08:00
			`NO_EXEC_STACK_DIRECTIVE`