[LoopVectorize] Fix assertion failure in fixReduction when tail-folding

When compiling the attached new test in scalable-reductions-tf.ll we
were hitting this assertion in fixReduction:

  Assertion `isa<PHINode>(U) && "Reduction exit must feed Phi's or select"' failed

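For reference, a rough schematic of where this fires (an assumed shape
for illustration, not the exact LLVM source): fixReduction visits the
users of the reduction's loop-exit value and expects each one to be a
phi, or a select introduced by tail-folding:

  // Schematic only: any other kind of user trips the assertion.
  for (User *U : LoopExitInst->users()) {
    if (isa<SelectInst>(U))
      continue; // tail-folded reductions feed a select first
    assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
  }
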
The loop contains a reduction and an intermediate store of the reduction
value. When vectorising with tail-folding, the contents of 'U' in the
assertion above happened to be a scatter_store. It turns out that we
were still creating a widen recipe for the invariant store, despite
knowing that we can actually sink it. The simplest fix is to change
buildVPlanWithVPRecipes so that we look for invariant stores before
attempting to widen them.
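
In C terms, the pattern the new test exercises looks like this
(an illustrative sketch of the IR in scalable-reductions-tf.ll; the
function name is made up): the store address is loop-invariant and the
stored value is the running reduction, so only the final value needs to
reach memory and the store can be sunk below the loop:

  void reduce(int *dst, const int *src, long n) {
    int sum = 0;
    for (long i = 0; i < n; ++i) {
      sum += src[i];
      *dst = sum; // invariant address: only the last value is observable
    }
  }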

Differential Revision: https://reviews.llvm.org/D126295
Author: David Sherwood
Date:   2022-05-24 13:24:20 +01:00
Commit: 87936c7b13 (parent: f1df6515e3)

3 changed files with 48 additions and 9 deletions

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8829,6 +8829,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
         auto OpRange = Plan->mapToVPValues(Instr->operands());
         Operands = {OpRange.begin(), OpRange.end()};
       }
+
+      // Invariant stores inside loop will be deleted and a single store
+      // with the final reduction value will be added to the exit block
+      StoreInst *SI;
+      if ((SI = dyn_cast<StoreInst>(&I)) &&
+          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+        continue;
+
       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
               Instr, Operands, Range, Plan)) {
         // If Instr can be simplified to an existing VPValue, use it.

@@ -8864,13 +8872,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
         continue;
       }

-      // Invariant stores inside loop will be deleted and a single store
-      // with the final reduction value will be added to the exit block
-      StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(&I)) &&
-          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
-        continue;
-
       // Otherwise, if all widening options failed, Instruction is to be
       // replicated. This may create a successor for VPBB.
       VPBasicBlock *NextVPBB =
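
Condensed, the reordered per-instruction logic now looks roughly like
this (a simplified paraphrase of the hunks above, not the verbatim
code):

  // Recognise invariant reduction stores *before* trying to widen, so
  // tail-folding can no longer wrap the store in a scatter recipe.
  StoreInst *SI = dyn_cast<StoreInst>(&I);
  if (SI && Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
    continue; // sunk: a single store of the final value in the exit block
  if (RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, Plan))
    continue; // widened; this path used to capture the invariant store
  // otherwise fall through to the replication path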

llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll

@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S | FileCheck %s
+
+define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) {
+; CHECK-LABEL: @invariant_store_red_exit_is_phi(
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ]
+; CHECK: %[[ACTIVE_LANE_MASK:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n)
+; CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[VEC_PHI]], %[[LOAD]]
+; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32> %[[VEC_PHI]]
+; CHECK: middle.block:
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %red = phi i32 [ 0, %entry ], [ %storemerge, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx6, align 4
+  %storemerge = add i32 %red, %load
+  store i32 %storemerge, i32* %dst, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}

llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll

@@ -328,8 +328,6 @@ define void @invariant_store(i32* %dst, i32* readonly %src) {
 ; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
 ; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
 ; CHECK: middle.block:
 ; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])