[AArch64] Sink splat shuffles to lane index intrinsics

This teaches AArch64TargetLowering::shouldSinkOperands to sink splat
shuffles to certain neon intrinsics, so that they can make use of the
lane variants of the instructions that are available.
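For example (a hand-written sketch, not one of the tests in this patch; the function and value names are illustrative), a splat shuffle defined in a different block from a sqrdmulh intrinsic call can now be sunk down next to the call so that the indexed form of the instruction can be selected:

define <4 x i32> @example(<4 x i32> %a, <4 x i32> %b, i1 %c) {
entry:
  ; Lane-3 splat of %b, defined outside the block that uses it.
  %s = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  br i1 %c, label %l1, label %l2

l1:
  ; shouldSinkOperands now lets CodeGenPrepare sink %s next to this call, so
  ; instruction selection can pick the by-element form (e.g.
  ; sqrdmulh v0.4s, v0.4s, v1.s[3]) instead of materializing the splat with a dup.
  %r = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %s)
  ret <4 x i32> %r

l2:
  ret <4 x i32> %a
}

declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)

The same handling applies to the smull, umull, sqdmull and sqdmulh intrinsics.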

Differential Revision: https://reviews.llvm.org/D112994
Author: David Green
Date:   2021-11-22 08:11:35 +00:00
parent 83484f8472
commit 760d4d03d5
3 changed files with 63 additions and 19 deletions

@@ -11924,6 +11924,12 @@ static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
}

static bool isSplatShuffle(Value *V) {
  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
    return is_splat(Shuf->getShuffleMask());
  return false;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -11934,12 +11940,24 @@ bool AArch64TargetLowering::shouldSinkOperands(
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::aarch64_neon_smull:
    case Intrinsic::aarch64_neon_umull:
      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
        return false;
      Ops.push_back(&II->getOperandUse(0));
      Ops.push_back(&II->getOperandUse(1));
      return true;
      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
        Ops.push_back(&II->getOperandUse(0));
        Ops.push_back(&II->getOperandUse(1));
        return true;
      }
      LLVM_FALLTHROUGH;
    case Intrinsic::aarch64_neon_sqdmull:
    case Intrinsic::aarch64_neon_sqdmulh:
    case Intrinsic::aarch64_neon_sqrdmulh:
      // Sink splats for index lane variants
      if (isSplatShuffle(II->getOperand(0)))
        Ops.push_back(&II->getOperandUse(0));
      if (isSplatShuffle(II->getOperand(1)))
        Ops.push_back(&II->getOperandUse(1));
      return !Ops.empty();
    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),

@@ -7,12 +7,11 @@ define <4 x i32> @smull(<4 x i16> %x, <4 x i16> *%y) {
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4h, v1.h[3]
; CHECK-NEXT: .LBB0_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: smlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT: smlal v0.4s, v2.4h, v1.h[3]
; CHECK-NEXT: b.eq .LBB0_1
; CHECK-NEXT: // %bb.2: // %l2
; CHECK-NEXT: ret
@@ -40,12 +39,11 @@ define <4 x i32> @umull(<4 x i16> %x, <4 x i16> *%y) {
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4h, v1.h[3]
; CHECK-NEXT: .LBB1_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.h[3]
; CHECK-NEXT: b.eq .LBB1_1
; CHECK-NEXT: // %bb.2: // %l2
; CHECK-NEXT: ret
@@ -73,12 +71,11 @@ define <4 x i32> @sqadd(<4 x i32> %x, <4 x i32> *%y) {
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB2_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.4s
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.s[3]
; CHECK-NEXT: sqadd v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB2_1
; CHECK-NEXT: // %bb.2: // %l2
@@ -107,12 +104,11 @@ define <4 x i32> @sqsub(<4 x i32> %x, <4 x i32> *%y) {
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB3_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.4s
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.s[3]
; CHECK-NEXT: sqsub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB3_1
; CHECK-NEXT: // %bb.2: // %l2
@@ -141,12 +137,11 @@ define <4 x i32> @sqdmulh(<4 x i32> %x, <4 x i32> *%y) {
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB4_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqdmulh v2.4s, v2.4s, v1.4s
; CHECK-NEXT: sqdmulh v2.4s, v2.4s, v1.s[3]
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB4_1
; CHECK-NEXT: // %bb.2: // %l2
@@ -175,12 +170,11 @@ define <4 x i32> @sqdmull(<4 x i16> %x, <4 x i16> *%y) {
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4h, v1.h[3]
; CHECK-NEXT: .LBB5_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqdmull v2.4s, v2.4h, v1.4h
; CHECK-NEXT: sqdmull v2.4s, v2.4h, v1.h[3]
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB5_1
; CHECK-NEXT: // %bb.2: // %l2

@@ -150,6 +150,38 @@ if.else:
  ret <8 x i16> %vmull1
}

; The masks used are suitable for umull, sink shufflevector to users.
define <8 x i16> @sink_shufflevector_smull(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @sink_shufflevector_smull(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
; CHECK-NEXT: ret <8 x i16> [[VMULL0]]
; CHECK: if.else:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
; CHECK-NEXT: ret <8 x i16> [[VMULL1]]
;
entry:
  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  br i1 undef, label %if.then, label %if.else

if.then:
  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
  ret <8 x i16> %vmull0

if.else:
  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
  ret <8 x i16> %vmull1
}

; Both exts and their shufflevector operands can be sunk.
define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @sink_shufflevector_ext_subadd(
@@ -271,8 +303,8 @@ if.else:
}
; Function Attrs: nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
; The insertelement should be inserted before shufflevector, otherwise 'does not dominate all uses' error will occur.
define <4 x i32> @sink_insertelement(i16 %e, i8 %f) {