[AArch64] Sink splat shuffles to lane index intrinsics

This teaches AArch64TargetLowering::shouldSinkOperands to sink splat
shuffles to certain neon intrinsics, so that they can make use of the
lane variants of the instructions that are available.
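For example (a hand-written sketch, not one of the tests in this patch; the function and value names are illustrative), a splat shuffle defined in a different block from a sqrdmulh intrinsic call can now be sunk down next to the call so that the indexed form of the instruction can be selected:

define <4 x i32> @example(<4 x i32> %a, <4 x i32> %b, i1 %c) {
entry:
  ; Lane-3 splat of %b, defined outside the block that uses it.
  %s = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  br i1 %c, label %l1, label %l2

l1:
  ; shouldSinkOperands now lets CodeGenPrepare sink %s next to this call, so
  ; instruction selection can pick the by-element form (e.g.
  ; sqrdmulh v0.4s, v0.4s, v1.s[3]) instead of materializing the splat with a dup.
  %r = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %s)
  ret <4 x i32> %r

l2:
  ret <4 x i32> %a
}

declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)

The same handling applies to the smull, umull, sqdmull and sqdmulh intrinsics.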

Differential Revision: https://reviews.llvm.org/D112994
Author: David Green
Date:   2021-11-22 08:11:35 +00:00
parent 83484f8472
commit 760d4d03d5
3 changed files with 63 additions and 19 deletions

@@ -11924,6 +11924,12 @@ static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
}

static bool isSplatShuffle(Value *V) {
  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
    return is_splat(Shuf->getShuffleMask());
  return false;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -11934,12 +11940,24 @@ bool AArch64TargetLowering::shouldSinkOperands(
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::aarch64_neon_smull:
    case Intrinsic::aarch64_neon_umull:
      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
        return false;
      Ops.push_back(&II->getOperandUse(0));
      Ops.push_back(&II->getOperandUse(1));
      return true;
      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
        Ops.push_back(&II->getOperandUse(0));
        Ops.push_back(&II->getOperandUse(1));
        return true;
      }
      LLVM_FALLTHROUGH;
    case Intrinsic::aarch64_neon_sqdmull:
    case Intrinsic::aarch64_neon_sqdmulh:
    case Intrinsic::aarch64_neon_sqrdmulh:
      // Sink splats for index lane variants
      if (isSplatShuffle(II->getOperand(0)))
        Ops.push_back(&II->getOperandUse(0));
      if (isSplatShuffle(II->getOperand(1)))
        Ops.push_back(&II->getOperandUse(1));
      return !Ops.empty();
    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),

@@ -7,12 +7,11 @@ define <4 x i32> @smull(<4 x i16> %x, <4 x i16> *%y) {
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4h, v1.h[3]
; CHECK-NEXT: .LBB0_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: smlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT: smlal v0.4s, v2.4h, v1.h[3]
; CHECK-NEXT: b.eq .LBB0_1
; CHECK-NEXT: // %bb.2: // %l2
; CHECK-NEXT: ret
@@ -40,12 +39,11 @@ define <4 x i32> @umull(<4 x i16> %x, <4 x i16> *%y) {
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4h, v1.h[3]
; CHECK-NEXT: .LBB1_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.h[3]
; CHECK-NEXT: b.eq .LBB1_1
; CHECK-NEXT: // %bb.2: // %l2
; CHECK-NEXT: ret
@@ -73,12 +71,11 @@ define <4 x i32> @sqadd(<4 x i32> %x, <4 x i32> *%y) {
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB2_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.4s
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.s[3]
; CHECK-NEXT: sqadd v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB2_1
; CHECK-NEXT: // %bb.2: // %l2
@@ -107,12 +104,11 @@ define <4 x i32> @sqsub(<4 x i32> %x, <4 x i32> *%y) {
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB3_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.4s
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.s[3]
; CHECK-NEXT: sqsub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB3_1
; CHECK-NEXT: // %bb.2: // %l2
@@ -141,12 +137,11 @@ define <4 x i32> @sqdmulh(<4 x i32> %x, <4 x i32> *%y) {
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB4_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqdmulh v2.4s, v2.4s, v1.4s
; CHECK-NEXT: sqdmulh v2.4s, v2.4s, v1.s[3]
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB4_1
; CHECK-NEXT: // %bb.2: // %l2
@@ -175,12 +170,11 @@ define <4 x i32> @sqdmull(<4 x i16> %x, <4 x i16> *%y) {
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v1.4h, v1.h[3]
; CHECK-NEXT: .LBB5_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: sqdmull v2.4s, v2.4h, v1.4h
; CHECK-NEXT: sqdmull v2.4s, v2.4h, v1.h[3]
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: b.eq .LBB5_1
; CHECK-NEXT: // %bb.2: // %l2

@@ -150,6 +150,38 @@ if.else:
  ret <8 x i16> %vmull1
}

; The masks used are suitable for umull, sink shufflevector to users.
define <8 x i16> @sink_shufflevector_smull(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @sink_shufflevector_smull(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
; CHECK-NEXT: ret <8 x i16> [[VMULL0]]
; CHECK: if.else:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
; CHECK-NEXT: ret <8 x i16> [[VMULL1]]
;
entry:
  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  br i1 undef, label %if.then, label %if.else

if.then:
  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
  ret <8 x i16> %vmull0

if.else:
  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
  ret <8 x i16> %vmull1
}

; Both exts and their shufflevector operands can be sunk.
define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @sink_shufflevector_ext_subadd(
@@ -271,8 +303,8 @@ if.else:
}
; Function Attrs: nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
; The insertelement should be inserted before shufflevector, otherwise 'does not dominate all uses' error will occur.
define <4 x i32> @sink_insertelement(i16 %e, i8 %f) {