[ARM] ACLE Chapter 9 intrinsics

Implemented the remaining integer data processing intrinsics from
the ARM ACLE v2.1 spec, such as parallel arithmetic and DSP-style
multiplications.

Differential Revision: https://reviews.llvm.org/D32282

llvm-svn: 302131
This commit is contained in:
Sam Parker 2017-05-04 08:37:59 +00:00
parent 927edebd04
commit b9ea36f9c1
3 changed files with 788 additions and 7 deletions

View File

@ -25,11 +25,93 @@
// In libgcc
BUILTIN(__clear_cache, "vv*v*", "i")
// 16-bit multiplications
BUILTIN(__builtin_arm_smulbb, "iii", "nc")
BUILTIN(__builtin_arm_smulbt, "iii", "nc")
BUILTIN(__builtin_arm_smultb, "iii", "nc")
BUILTIN(__builtin_arm_smultt, "iii", "nc")
BUILTIN(__builtin_arm_smulwb, "iii", "nc")
BUILTIN(__builtin_arm_smulwt, "iii", "nc")
// Saturating arithmetic
BUILTIN(__builtin_arm_qadd, "iii", "nc")
BUILTIN(__builtin_arm_qsub, "iii", "nc")
BUILTIN(__builtin_arm_ssat, "iiUi", "nc")
BUILTIN(__builtin_arm_usat, "UiUiUi", "nc")
BUILTIN(__builtin_arm_usat, "UiiUi", "nc")
// Accumulating multiplications
BUILTIN(__builtin_arm_smlabb, "iiii", "nc")
BUILTIN(__builtin_arm_smlabt, "iiii", "nc")
BUILTIN(__builtin_arm_smlatb, "iiii", "nc")
BUILTIN(__builtin_arm_smlatt, "iiii", "nc")
BUILTIN(__builtin_arm_smlawb, "iiii", "nc")
BUILTIN(__builtin_arm_smlawt, "iiii", "nc")
// Parallel 16-bit saturation
BUILTIN(__builtin_arm_ssat16, "iii", "nc")
BUILTIN(__builtin_arm_usat16, "iii", "nc")
// Packing and unpacking
BUILTIN(__builtin_arm_sxtab16, "iii", "nc")
BUILTIN(__builtin_arm_sxtb16, "ii", "nc")
BUILTIN(__builtin_arm_uxtab16, "iii", "nc")
BUILTIN(__builtin_arm_uxtb16, "ii", "nc")
// Parallel selection
BUILTIN(__builtin_arm_sel, "iii", "nc")
// Parallel 8-bit addition and subtraction
BUILTIN(__builtin_arm_qadd8, "iii", "nc")
BUILTIN(__builtin_arm_qsub8, "iii", "nc")
BUILTIN(__builtin_arm_sadd8, "iii", "nc")
BUILTIN(__builtin_arm_shadd8, "iii", "nc")
BUILTIN(__builtin_arm_shsub8, "iii", "nc")
BUILTIN(__builtin_arm_ssub8, "iii", "nc")
BUILTIN(__builtin_arm_uadd8, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uhadd8, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uhsub8, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uqadd8, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uqsub8, "UiUiUi", "nc")
BUILTIN(__builtin_arm_usub8, "UiUiUi", "nc")
// Sum of 8-bit absolute differences
BUILTIN(__builtin_arm_usad8, "UiUiUi", "nc")
BUILTIN(__builtin_arm_usada8, "UiUiUiUi", "nc")
// Parallel 16-bit addition and subtraction
BUILTIN(__builtin_arm_qadd16, "iii", "nc")
BUILTIN(__builtin_arm_qasx, "iii", "nc")
BUILTIN(__builtin_arm_qsax, "iii", "nc")
BUILTIN(__builtin_arm_qsub16, "iii", "nc")
BUILTIN(__builtin_arm_sadd16, "iii", "nc")
BUILTIN(__builtin_arm_sasx, "iii", "nc")
BUILTIN(__builtin_arm_shadd16, "iii", "nc")
BUILTIN(__builtin_arm_shasx, "iii", "nc")
BUILTIN(__builtin_arm_shsax, "iii", "nc")
BUILTIN(__builtin_arm_shsub16, "iii", "nc")
BUILTIN(__builtin_arm_ssax, "iii", "nc")
BUILTIN(__builtin_arm_ssub16, "iii", "nc")
BUILTIN(__builtin_arm_uadd16, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uasx, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uhadd16, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uhasx, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uhsax, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uhsub16, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uqadd16, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uqasx, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uqsax, "UiUiUi", "nc")
BUILTIN(__builtin_arm_uqsub16, "UiUiUi", "nc")
BUILTIN(__builtin_arm_usax, "UiUiUi", "nc")
BUILTIN(__builtin_arm_usub16, "UiUiUi", "nc")
// Parallel 16-bit multiplication
BUILTIN(__builtin_arm_smlad, "iiii", "nc")
BUILTIN(__builtin_arm_smladx, "iiii", "nc")
BUILTIN(__builtin_arm_smlald, "LLiiiLLi", "nc")
BUILTIN(__builtin_arm_smlaldx, "LLiiiLLi", "nc")
BUILTIN(__builtin_arm_smlsd, "iiii", "nc")
BUILTIN(__builtin_arm_smlsdx, "iiii", "nc")
BUILTIN(__builtin_arm_smlsld, "LLiiiLLi", "nc")
BUILTIN(__builtin_arm_smlsldx, "LLiiiLLi", "nc")
BUILTIN(__builtin_arm_smuad, "iii", "nc")
BUILTIN(__builtin_arm_smuadx, "iii", "nc")
BUILTIN(__builtin_arm_smusd, "iii", "nc")
BUILTIN(__builtin_arm_smusdx, "iii", "nc")
// Bit manipulation
BUILTIN(__builtin_arm_rbit, "UiUi", "nc")

View File

@ -224,6 +224,36 @@ __rbitl(unsigned long __t) {
#endif
}
/*
 * 9.3 16-bit multiplications
 *
 * Every intrinsic in this group is an always-inline wrapper that hands its
 * two int32_t operands, unchanged and in order, to the __builtin_arm_*
 * builtin of the same name and returns that builtin's result.  The group is
 * only compiled when __ARM_FEATURE_DSP evaluates to non-zero.
 *
 * NOTE(review): the bb/bt/tb/tt and wb/wt suffixes presumably name the
 * bottom/top halfword operand selection from the ACLE spec -- confirm there.
 */
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif
/*
* 9.4 Saturating intrinsics
*
@ -231,13 +261,13 @@ __rbitl(unsigned long __t) {
* intrinsics are implemented and the flag is enabled.
*/
/* 9.4.1 Width-specified saturation intrinsics */
#if __ARM_32BIT_STATE
#if __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
/* 9.4.2 Saturating addition and subtraction intrinsics */
#if __ARM_32BIT_STATE
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
return __builtin_arm_qadd(__t, __v);
@ -254,6 +284,290 @@ __qdbl(int32_t __t) {
}
#endif
/*
 * 9.4.3 Accumulating multiplications
 *
 * Three-operand wrappers: each forwards (__a, __b, __c) untouched to the
 * identically named __builtin_arm_* builtin and returns its int32_t result.
 * Compiled only when __ARM_FEATURE_DSP evaluates to non-zero.
 *
 * NOTE(review): __c is presumably the accumulator added to the product --
 * confirm against the ACLE spec.
 */
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
/* 9.5.4 Parallel 16-bit saturation */
/*
 * Both intrinsics are function-like macros that expand directly to the
 * builtin of the same name, passing x and y through unchanged.  Available
 * only when __ARM_FEATURE_SIMD32 evaluates to non-zero.
 * NOTE(review): presumably macros (rather than inline functions) so that y
 * reaches the builtin as a compile-time constant -- confirm.
 */
#if __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
/*
 * 9.5.5 Packing and unpacking
 *
 * Compiled only when __ARM_FEATURE_SIMD32 evaluates to non-zero.  This
 * section also introduces the packed typedefs used by the SIMD32 wrappers;
 * all four are plain 32-bit integer aliases.
 */
#if __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;    /* alias of int32_t  */
typedef int32_t int16x2_t;   /* alias of int32_t  */
typedef uint32_t uint8x4_t;  /* alias of uint32_t */
typedef uint32_t uint16x2_t; /* alias of uint32_t */

/* Forwards both operands to __builtin_arm_sxtab16. */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}

/* Forwards its operand to __builtin_arm_sxtb16. */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}

/*
 * Forwards both operands to __builtin_arm_uxtab16.
 * NOTE(review): this and __uxtb16 use the signed typedefs; the ACLE spec
 * presumably declares the unsigned-extend forms with uint16x2_t/uint8x4_t --
 * confirm before changing the prototype.
 */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}

/* Forwards its operand to __builtin_arm_uxtb16. */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif
/*
 * 9.5.6 Parallel selection
 *
 * __sel forwards both packed operands to __builtin_arm_sel and returns the
 * builtin's result.  Compiled only when __ARM_FEATURE_SIMD32 evaluates to
 * non-zero.
 */
#if __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif
/*
 * 9.5.7 Parallel 8-bit addition and subtraction
 *
 * Two-operand wrappers over the identically named __builtin_arm_* builtins.
 * The first six take and return int8x4_t; the last six take and return
 * uint8x4_t.  Compiled only when __ARM_FEATURE_SIMD32 evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
/* Wrappers using the signed packed type. */
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}

static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}

static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}

static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}

static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}

static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}

/* Wrappers using the unsigned packed type. */
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}

static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}

static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}

static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}

static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}

static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif
/*
 * 9.5.8 Sum of 8-bit absolute differences
 *
 * Both wrappers return uint32_t and forward their operands unchanged to the
 * builtin of the same name.  Compiled only when __ARM_FEATURE_SIMD32
 * evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
/* Two packed operands. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}

/* Two packed operands plus a uint32_t third operand (__c). */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif
/*
 * 9.5.9 Parallel 16-bit addition and subtraction
 *
 * Two-operand wrappers over the identically named __builtin_arm_* builtins.
 * The first twelve take and return int16x2_t; the remaining twelve take and
 * return uint16x2_t.  Compiled only when __ARM_FEATURE_SIMD32 evaluates to
 * non-zero.
 */
#if __ARM_FEATURE_SIMD32
/* Wrappers using the signed packed type. */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}

/* Wrappers using the unsigned packed type. */
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}

static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif
/*
 * 9.5.10 Parallel 16-bit multiplications
 *
 * Wrappers over the identically named __builtin_arm_* builtins.  The
 * three-operand forms take two int16x2_t values plus a third operand __c
 * that is int32_t (result int32_t) or int64_t (result int64_t); the
 * two-operand forms take two int16x2_t values and return int32_t.
 * Compiled only when __ARM_FEATURE_SIMD32 evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}

static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}

static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}

static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}

static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif
/* 9.7 CRC32 intrinsics */
#if __ARM_FEATURE_CRC32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))

View File

@ -76,7 +76,7 @@ void test_dbg(void) {
// AArch32: call i32 @llvm.arm.strex
// AArch64: call i64 @llvm.aarch64.ldxr
// AArch64: call i32 @llvm.aarch64.stxr
uint32_t test_swp(uint32_t x, volatile void *p) {
void test_swp(uint32_t x, volatile void *p) {
__swp(x, p);
}
@ -118,6 +118,7 @@ void test_nop(void) {
}
/* 9 DATA-PROCESSING INTRINSICS */
/* 9.2 Miscellaneous data-processing intrinsics */
// ARM-LABEL: test_ror
// ARM: lshr
@ -266,8 +267,7 @@ uint64_t test_rbitll(uint64_t t) {
}
/* 9.4 Saturating intrinsics */
#ifdef __ARM_32BIT_STATE
#ifdef __ARM_FEATURE_SAT
/* 9.4.1 Width-specified saturation intrinsics */
// AArch32-LABEL: test_ssat
// AArch32: call i32 @llvm.arm.ssat(i32 %t, i32 1)
@ -277,11 +277,13 @@ int32_t test_ssat(int32_t t) {
// AArch32-LABEL: test_usat
// AArch32: call i32 @llvm.arm.usat(i32 %t, i32 2)
int32_t test_usat(int32_t t) {
uint32_t test_usat(int32_t t) {
return __usat(t, 2);
}
#endif
/* 9.4.2 Saturating addition and subtraction intrinsics */
#ifdef __ARM_FEATURE_DSP
// AArch32-LABEL: test_qadd
// AArch32: call i32 @llvm.arm.qadd(i32 %a, i32 %b)
int32_t test_qadd(int32_t a, int32_t b) {
@ -304,6 +306,389 @@ int32_t test_qdbl() {
}
#endif
/*
 * 9.3 16-bit multiplications
 *
 * One test per __smul* intrinsic.  Each function just returns the result of
 * the intrinsic; the check directives placed before it require the output to
 * contain a call to the llvm.arm.* intrinsic of the same name.  The section
 * is only compiled when __ARM_FEATURE_DSP evaluates to non-zero.
 */
#if __ARM_FEATURE_DSP
// AArch32-LABEL: test_smulbb
// AArch32: call i32 @llvm.arm.smulbb
int32_t test_smulbb(int32_t a, int32_t b) {
return __smulbb(a, b);
}
// AArch32-LABEL: test_smulbt
// AArch32: call i32 @llvm.arm.smulbt
int32_t test_smulbt(int32_t a, int32_t b) {
return __smulbt(a, b);
}
// AArch32-LABEL: test_smultb
// AArch32: call i32 @llvm.arm.smultb
int32_t test_smultb(int32_t a, int32_t b) {
return __smultb(a, b);
}
// AArch32-LABEL: test_smultt
// AArch32: call i32 @llvm.arm.smultt
int32_t test_smultt(int32_t a, int32_t b) {
return __smultt(a, b);
}
// AArch32-LABEL: test_smulwb
// AArch32: call i32 @llvm.arm.smulwb
int32_t test_smulwb(int32_t a, int32_t b) {
return __smulwb(a, b);
}
// AArch32-LABEL: test_smulwt
// AArch32: call i32 @llvm.arm.smulwt
int32_t test_smulwt(int32_t a, int32_t b) {
return __smulwt(a, b);
}
#endif
/* 9.4.3 Accumulating multiplications */
/*
 * One test per __smla* intrinsic.  The check directives pin both the
 * llvm.arm.* callee and its operands (%a, %b, %c), so the parameter names of
 * these functions are part of what is being matched.  Compiled only when
 * __ARM_FEATURE_DSP evaluates to non-zero.
 */
#if __ARM_FEATURE_DSP
// AArch32-LABEL: test_smlabb
// AArch32: call i32 @llvm.arm.smlabb(i32 %a, i32 %b, i32 %c)
int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
return __smlabb(a, b, c);
}
// AArch32-LABEL: test_smlabt
// AArch32: call i32 @llvm.arm.smlabt(i32 %a, i32 %b, i32 %c)
int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
return __smlabt(a, b, c);
}
// AArch32-LABEL: test_smlatb
// AArch32: call i32 @llvm.arm.smlatb(i32 %a, i32 %b, i32 %c)
int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
return __smlatb(a, b, c);
}
// AArch32-LABEL: test_smlatt
// AArch32: call i32 @llvm.arm.smlatt(i32 %a, i32 %b, i32 %c)
int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
return __smlatt(a, b, c);
}
// AArch32-LABEL: test_smlawb
// AArch32: call i32 @llvm.arm.smlawb(i32 %a, i32 %b, i32 %c)
int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
return __smlawb(a, b, c);
}
// AArch32-LABEL: test_smlawt
// AArch32: call i32 @llvm.arm.smlawt(i32 %a, i32 %b, i32 %c)
int32_t test_smlawt(int32_t a, int32_t b, int32_t c) {
return __smlawt(a, b, c);
}
#endif
/* 9.5.4 Parallel 16-bit saturation */
/*
 * Both tests invoke the saturation macro with the literal 15 as its second
 * argument and expect a call to the matching llvm.arm.* intrinsic.
 * Compiled only when __ARM_FEATURE_SIMD32 evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: test_ssat16
// AArch32: call i32 @llvm.arm.ssat16
int16x2_t test_ssat16(int16x2_t a) {
return __ssat16(a, 15);
}
// AArch32-LABEL: test_usat16
// AArch32: call i32 @llvm.arm.usat16
uint16x2_t test_usat16(int16x2_t a) {
return __usat16(a, 15);
}
#endif
/* 9.5.5 Packing and unpacking */
/*
 * One test per extend / extend-and-add intrinsic; each expects a call to the
 * llvm.arm.* intrinsic of the same name.  Compiled only when
 * __ARM_FEATURE_SIMD32 evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: test_sxtab16
// AArch32: call i32 @llvm.arm.sxtab16
int16x2_t test_sxtab16(int16x2_t a, int8x4_t b) {
return __sxtab16(a, b);
}
// AArch32-LABEL: test_sxtb16
// AArch32: call i32 @llvm.arm.sxtb16
int16x2_t test_sxtb16(int8x4_t a) {
return __sxtb16(a);
}
// AArch32-LABEL: test_uxtab16
// AArch32: call i32 @llvm.arm.uxtab16
int16x2_t test_uxtab16(int16x2_t a, int8x4_t b) {
return __uxtab16(a, b);
}
// AArch32-LABEL: test_uxtb16
// AArch32: call i32 @llvm.arm.uxtb16
int16x2_t test_uxtb16(int8x4_t a) {
return __uxtb16(a);
}
#endif
/* 9.5.6 Parallel selection */
/*
 * Expects __sel to produce a call to llvm.arm.sel.  Compiled only when
 * __ARM_FEATURE_SIMD32 evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: test_sel
// AArch32: call i32 @llvm.arm.sel
uint8x4_t test_sel(uint8x4_t a, uint8x4_t b) {
return __sel(a, b);
}
#endif
/*
 * 9.5.7 Parallel 8-bit addition and subtraction
 *
 * One test per 8-bit parallel intrinsic; each expects a call to the
 * llvm.arm.* intrinsic of the same name.  Signed variants use int8x4_t and
 * unsigned variants use uint8x4_t for both operands and result, mirroring
 * the wrapper prototypes.  Compiled only when __ARM_FEATURE_SIMD32 evaluates
 * to non-zero.
 */
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: test_qadd8
// AArch32: call i32 @llvm.arm.qadd8
// Returns int8x4_t (not int16x2_t) to match __qadd8 and the sibling tests.
int8x4_t test_qadd8(int8x4_t a, int8x4_t b) {
  return __qadd8(a, b);
}
// AArch32-LABEL: test_qsub8
// AArch32: call i32 @llvm.arm.qsub8
int8x4_t test_qsub8(int8x4_t a, int8x4_t b) {
  return __qsub8(a, b);
}
// AArch32-LABEL: test_sadd8
// AArch32: call i32 @llvm.arm.sadd8
int8x4_t test_sadd8(int8x4_t a, int8x4_t b) {
  return __sadd8(a, b);
}
// AArch32-LABEL: test_shadd8
// AArch32: call i32 @llvm.arm.shadd8
int8x4_t test_shadd8(int8x4_t a, int8x4_t b) {
  return __shadd8(a, b);
}
// AArch32-LABEL: test_shsub8
// AArch32: call i32 @llvm.arm.shsub8
int8x4_t test_shsub8(int8x4_t a, int8x4_t b) {
  return __shsub8(a, b);
}
// AArch32-LABEL: test_ssub8
// AArch32: call i32 @llvm.arm.ssub8
int8x4_t test_ssub8(int8x4_t a, int8x4_t b) {
  return __ssub8(a, b);
}
// AArch32-LABEL: test_uadd8
// AArch32: call i32 @llvm.arm.uadd8
uint8x4_t test_uadd8(uint8x4_t a, uint8x4_t b) {
  return __uadd8(a, b);
}
// AArch32-LABEL: test_uhadd8
// AArch32: call i32 @llvm.arm.uhadd8
uint8x4_t test_uhadd8(uint8x4_t a, uint8x4_t b) {
  return __uhadd8(a, b);
}
// AArch32-LABEL: test_uhsub8
// AArch32: call i32 @llvm.arm.uhsub8
uint8x4_t test_uhsub8(uint8x4_t a, uint8x4_t b) {
  return __uhsub8(a, b);
}
// AArch32-LABEL: test_uqadd8
// AArch32: call i32 @llvm.arm.uqadd8
uint8x4_t test_uqadd8(uint8x4_t a, uint8x4_t b) {
  return __uqadd8(a, b);
}
// AArch32-LABEL: test_uqsub8
// AArch32: call i32 @llvm.arm.uqsub8
uint8x4_t test_uqsub8(uint8x4_t a, uint8x4_t b) {
  return __uqsub8(a, b);
}
// AArch32-LABEL: test_usub8
// AArch32: call i32 @llvm.arm.usub8
uint8x4_t test_usub8(uint8x4_t a, uint8x4_t b) {
  return __usub8(a, b);
}
#endif
/*
 * 9.5.8 Sum of 8-bit absolute differences
 *
 * Each test expects a call to the llvm.arm.* intrinsic of the same name.
 * Compiled only when __ARM_FEATURE_SIMD32 evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: test_usad8
// AArch32: call i32 @llvm.arm.usad8
uint32_t test_usad8(uint8x4_t a, uint8x4_t b) {
  return __usad8(a, b);
}
// AArch32-LABEL: test_usada8
// AArch32: call i32 @llvm.arm.usada8
// Parameters use the intrinsic's own operand types (two packed uint8x4_t
// values and a uint32_t third operand) so full 32-bit values reach the call
// instead of zero-extended single bytes.
uint32_t test_usada8(uint8x4_t a, uint8x4_t b, uint32_t c) {
  return __usada8(a, b, c);
}
#endif
/* 9.5.9 Parallel 16-bit addition and subtraction */
/*
 * One test per 16-bit parallel add/subtract intrinsic; each expects a call
 * to the llvm.arm.* intrinsic of the same name.  Signed variants use
 * int16x2_t and unsigned variants use uint16x2_t throughout.  Compiled only
 * when __ARM_FEATURE_SIMD32 evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: test_qadd16
// AArch32: call i32 @llvm.arm.qadd16
int16x2_t test_qadd16(int16x2_t a, int16x2_t b) {
return __qadd16(a, b);
}
// AArch32-LABEL: test_qasx
// AArch32: call i32 @llvm.arm.qasx
int16x2_t test_qasx(int16x2_t a, int16x2_t b) {
return __qasx(a, b);
}
// AArch32-LABEL: test_qsax
// AArch32: call i32 @llvm.arm.qsax
int16x2_t test_qsax(int16x2_t a, int16x2_t b) {
return __qsax(a, b);
}
// AArch32-LABEL: test_qsub16
// AArch32: call i32 @llvm.arm.qsub16
int16x2_t test_qsub16(int16x2_t a, int16x2_t b) {
return __qsub16(a, b);
}
// AArch32-LABEL: test_sadd16
// AArch32: call i32 @llvm.arm.sadd16
int16x2_t test_sadd16(int16x2_t a, int16x2_t b) {
return __sadd16(a, b);
}
// AArch32-LABEL: test_sasx
// AArch32: call i32 @llvm.arm.sasx
int16x2_t test_sasx(int16x2_t a, int16x2_t b) {
return __sasx(a, b);
}
// AArch32-LABEL: test_shadd16
// AArch32: call i32 @llvm.arm.shadd16
int16x2_t test_shadd16(int16x2_t a, int16x2_t b) {
return __shadd16(a, b);
}
// AArch32-LABEL: test_shasx
// AArch32: call i32 @llvm.arm.shasx
int16x2_t test_shasx(int16x2_t a, int16x2_t b) {
return __shasx(a, b);
}
// AArch32-LABEL: test_shsax
// AArch32: call i32 @llvm.arm.shsax
int16x2_t test_shsax(int16x2_t a, int16x2_t b) {
return __shsax(a, b);
}
// AArch32-LABEL: test_shsub16
// AArch32: call i32 @llvm.arm.shsub16
int16x2_t test_shsub16(int16x2_t a, int16x2_t b) {
return __shsub16(a, b);
}
// AArch32-LABEL: test_ssax
// AArch32: call i32 @llvm.arm.ssax
int16x2_t test_ssax(int16x2_t a, int16x2_t b) {
return __ssax(a, b);
}
// AArch32-LABEL: test_ssub16
// AArch32: call i32 @llvm.arm.ssub16
int16x2_t test_ssub16(int16x2_t a, int16x2_t b) {
return __ssub16(a, b);
}
// AArch32-LABEL: test_uadd16
// AArch32: call i32 @llvm.arm.uadd16
uint16x2_t test_uadd16(uint16x2_t a, uint16x2_t b) {
return __uadd16(a, b);
}
// AArch32-LABEL: test_uasx
// AArch32: call i32 @llvm.arm.uasx
uint16x2_t test_uasx(uint16x2_t a, uint16x2_t b) {
return __uasx(a, b);
}
// AArch32-LABEL: test_uhadd16
// AArch32: call i32 @llvm.arm.uhadd16
uint16x2_t test_uhadd16(uint16x2_t a, uint16x2_t b) {
return __uhadd16(a, b);
}
// AArch32-LABEL: test_uhasx
// AArch32: call i32 @llvm.arm.uhasx
uint16x2_t test_uhasx(uint16x2_t a, uint16x2_t b) {
return __uhasx(a, b);
}
// AArch32-LABEL: test_uhsax
// AArch32: call i32 @llvm.arm.uhsax
uint16x2_t test_uhsax(uint16x2_t a, uint16x2_t b) {
return __uhsax(a, b);
}
// AArch32-LABEL: test_uhsub16
// AArch32: call i32 @llvm.arm.uhsub16
uint16x2_t test_uhsub16(uint16x2_t a, uint16x2_t b) {
return __uhsub16(a, b);
}
// AArch32-LABEL: test_uqadd16
// AArch32: call i32 @llvm.arm.uqadd16
uint16x2_t test_uqadd16(uint16x2_t a, uint16x2_t b) {
return __uqadd16(a, b);
}
// AArch32-LABEL: test_uqasx
// AArch32: call i32 @llvm.arm.uqasx
uint16x2_t test_uqasx(uint16x2_t a, uint16x2_t b) {
return __uqasx(a, b);
}
// AArch32-LABEL: test_uqsax
// AArch32: call i32 @llvm.arm.uqsax
uint16x2_t test_uqsax(uint16x2_t a, uint16x2_t b) {
return __uqsax(a, b);
}
// AArch32-LABEL: test_uqsub16
// AArch32: call i32 @llvm.arm.uqsub16
uint16x2_t test_uqsub16(uint16x2_t a, uint16x2_t b) {
return __uqsub16(a, b);
}
// AArch32-LABEL: test_usax
// AArch32: call i32 @llvm.arm.usax
uint16x2_t test_usax(uint16x2_t a, uint16x2_t b) {
return __usax(a, b);
}
// AArch32-LABEL: test_usub16
// AArch32: call i32 @llvm.arm.usub16
uint16x2_t test_usub16(uint16x2_t a, uint16x2_t b) {
return __usub16(a, b);
}
#endif
/* 9.5.10 Parallel 16-bit multiplications */
/*
 * One test per 16-bit parallel multiply intrinsic; each expects a call to
 * the llvm.arm.* intrinsic of the same name.  The *ld/*ldx variants take an
 * int64_t third operand and are checked for an i64 result; all others are
 * checked for an i32 result.  Compiled only when __ARM_FEATURE_SIMD32
 * evaluates to non-zero.
 */
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: test_smlad
// AArch32: call i32 @llvm.arm.smlad
int32_t test_smlad(int16x2_t a, int16x2_t b, int32_t c) {
return __smlad(a, b, c);
}
// AArch32-LABEL: test_smladx
// AArch32: call i32 @llvm.arm.smladx
int32_t test_smladx(int16x2_t a, int16x2_t b, int32_t c) {
return __smladx(a, b, c);
}
// AArch32-LABEL: test_smlald
// AArch32: call i64 @llvm.arm.smlald
int64_t test_smlald(int16x2_t a, int16x2_t b, int64_t c) {
return __smlald(a, b, c);
}
// AArch32-LABEL: test_smlaldx
// AArch32: call i64 @llvm.arm.smlaldx
int64_t test_smlaldx(int16x2_t a, int16x2_t b, int64_t c) {
return __smlaldx(a, b, c);
}
// AArch32-LABEL: test_smlsd
// AArch32: call i32 @llvm.arm.smlsd
int32_t test_smlsd(int16x2_t a, int16x2_t b, int32_t c) {
return __smlsd(a, b, c);
}
// AArch32-LABEL: test_smlsdx
// AArch32: call i32 @llvm.arm.smlsdx
int32_t test_smlsdx(int16x2_t a, int16x2_t b, int32_t c) {
return __smlsdx(a, b, c);
}
// AArch32-LABEL: test_smlsld
// AArch32: call i64 @llvm.arm.smlsld
int64_t test_smlsld(int16x2_t a, int16x2_t b, int64_t c) {
return __smlsld(a, b, c);
}
// AArch32-LABEL: test_smlsldx
// AArch32: call i64 @llvm.arm.smlsldx
int64_t test_smlsldx(int16x2_t a, int16x2_t b, int64_t c) {
return __smlsldx(a, b, c);
}
// AArch32-LABEL: test_smuad
// AArch32: call i32 @llvm.arm.smuad
int32_t test_smuad(int16x2_t a, int16x2_t b) {
return __smuad(a, b);
}
// AArch32-LABEL: test_smuadx
// AArch32: call i32 @llvm.arm.smuadx
int32_t test_smuadx(int16x2_t a, int16x2_t b) {
return __smuadx(a, b);
}
// AArch32-LABEL: test_smusd
// AArch32: call i32 @llvm.arm.smusd
int32_t test_smusd(int16x2_t a, int16x2_t b) {
return __smusd(a, b);
}
// AArch32-LABEL: test_smusdx
// AArch32: call i32 @llvm.arm.smusdx
int32_t test_smusdx(int16x2_t a, int16x2_t b) {
return __smusdx(a, b);
}
#endif
/* 9.7 CRC32 intrinsics */
// ARM-LABEL: test_crc32b
// AArch32: call i32 @llvm.arm.crc32b