[LoopVectorize] Fix assertion failure in fixReduction when tail-folding

When compiling the attached new test in scalable-reductions-tf.ll we
were hitting this assertion in fixReduction:

  Assertion `isa<PHINode>(U) && "Reduction exit must feed Phi's or select"' failed

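For reference, a rough schematic of where this fires (an assumed shape
for illustration, not the exact LLVM source): fixReduction visits the
users of the reduction's loop-exit value and expects each one to be a
phi, or a select introduced by tail-folding:

  // Schematic only: any other kind of user trips the assertion.
  for (User *U : LoopExitInst->users()) {
    if (isa<SelectInst>(U))
      continue; // tail-folded reductions feed a select first
    assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
  }
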
The loop contains a reduction and an intermediate store of the reduction
value. When vectorising with tail-folding, the contents of 'U' in the
assertion above happened to be a scatter_store. It turns out that we
were still creating a widen recipe for the invariant store, despite
knowing that we can actually sink it. The simplest fix is to change
buildVPlanWithVPRecipes so that we look for invariant stores before
attempting to widen them.
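
In C terms, the pattern the new test exercises looks like this
(an illustrative sketch of the IR in scalable-reductions-tf.ll; the
function name is made up): the store address is loop-invariant and the
stored value is the running reduction, so only the final value needs to
reach memory and the store can be sunk below the loop:

  void reduce(int *dst, const int *src, long n) {
    int sum = 0;
    for (long i = 0; i < n; ++i) {
      sum += src[i];
      *dst = sum; // invariant address: only the last value is observable
    }
  }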

Differential Revision: https://reviews.llvm.org/D126295
Author: David Sherwood
Date:   2022-05-24 13:24:20 +01:00
Commit: 87936c7b13 (parent: f1df6515e3)

3 changed files with 48 additions and 9 deletions

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8829,6 +8829,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
         auto OpRange = Plan->mapToVPValues(Instr->operands());
         Operands = {OpRange.begin(), OpRange.end()};
       }
+
+      // Invariant stores inside loop will be deleted and a single store
+      // with the final reduction value will be added to the exit block
+      StoreInst *SI;
+      if ((SI = dyn_cast<StoreInst>(&I)) &&
+          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+        continue;
+
       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
               Instr, Operands, Range, Plan)) {
         // If Instr can be simplified to an existing VPValue, use it.

@@ -8864,13 +8872,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
         continue;
       }

-      // Invariant stores inside loop will be deleted and a single store
-      // with the final reduction value will be added to the exit block
-      StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(&I)) &&
-          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
-        continue;
-
       // Otherwise, if all widening options failed, Instruction is to be
       // replicated. This may create a successor for VPBB.
       VPBasicBlock *NextVPBB =
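
Condensed, the reordered per-instruction logic now looks roughly like
this (a simplified paraphrase of the hunks above, not the verbatim
code):

  // Recognise invariant reduction stores *before* trying to widen, so
  // tail-folding can no longer wrap the store in a scatter recipe.
  StoreInst *SI = dyn_cast<StoreInst>(&I);
  if (SI && Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
    continue; // sunk: a single store of the final value in the exit block
  if (RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, Plan))
    continue; // widened; this path used to capture the invariant store
  // otherwise fall through to the replication path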

llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll

@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S | FileCheck %s
+
+define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) {
+; CHECK-LABEL: @invariant_store_red_exit_is_phi(
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ]
+; CHECK: %[[ACTIVE_LANE_MASK:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n)
+; CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[VEC_PHI]], %[[LOAD]]
+; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32> %[[VEC_PHI]]
+; CHECK: middle.block:
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %red = phi i32 [ 0, %entry ], [ %storemerge, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx6, align 4
+  %storemerge = add i32 %red, %load
+  store i32 %storemerge, i32* %dst, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}

llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll

@@ -328,8 +328,6 @@ define void @invariant_store(i32* %dst, i32* readonly %src) {
 ; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
 ; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
 ; CHECK: middle.block:
 ; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])