[VPlan] Add first VPlan version of sinkScalarOperands.

This patch adds a first VPlan-based implementation of sinking of scalar
operands.

The current version traverses a VPlan once and processes all operands of
a predicated REPLICATE recipe. If one of those operands can be sunk,
it is moved to the block containing the predicated REPLICATE recipe.
Processing then continues with the operands of the sunk recipe.

The initial version does not re-process candidates after other recipes
have been sunk. It also cannot partially sink induction increments at
the moment. The VPlan only contains WIDEN-INDUCTION recipes and if the
induction is used for example in a GEP, only the first lane is used and
in the lowered IR the adds for the other lanes can be sunk into the
predicated blocks.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D100258
This commit is contained in:
Florian Hahn 2021-05-24 14:14:08 +01:00
parent 5d7c1d8f33
commit 65d3dd7c88
No known key found for this signature in database
GPG Key ID: 61D7554B5CECDC0D
8 changed files with 108 additions and 45 deletions

View File

@ -4599,12 +4599,22 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
while (!Worklist.empty()) {
auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
// We can't sink an instruction if it is a phi node, is already in the
// predicated block, is not in the loop, or may have side effects.
if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
!VectorLoop->contains(I) || I->mayHaveSideEffects())
// We can't sink an instruction if it is a phi node, is not in the loop,
// or may have side effects.
if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
I->mayHaveSideEffects())
continue;
// If the instruction is already in PredBB, check if we can sink its
// operands. In that case, VPlan's sinkScalarOperands() succeeded in
// sinking the scalar instruction I, hence it appears in PredBB; but it
// may have failed to sink I's operands (recursively), which we try
// (again) here.
if (I->getParent() == PredBB) {
Worklist.insert(I->op_begin(), I->op_end());
continue;
}
// It's legal to sink the instruction if all its uses occur in the
// predicated block. Otherwise, there's nothing to do yet, and we may
// need to reanalyze the instruction.
@ -9245,6 +9255,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
VPlanTransforms::sinkScalarOperands(*Plan);
std::string PlanName;
raw_string_ostream RSO(PlanName);
ElementCount VF = Range.Start;

View File

@ -99,3 +99,52 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
}
}
}
/// Sink replicate recipes that feed predicated replicate recipes into the
/// block holding their users. A recipe is only moved when every one of its
/// users is a recipe located in one single block, and when the recipe neither
/// has side effects nor touches memory. Returns true if any recipe was moved.
bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
  // Seed the worklist with the operands of every predicated replicate recipe
  // found while walking the plan depth-first from its entry block.
  SetVector<VPValue *> Candidates;
  auto Blocks = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
  for (VPBasicBlock *Block : VPBlockUtils::blocksOnly<VPBasicBlock>(Blocks)) {
    for (auto &R : *Block) {
      if (auto *Rep = dyn_cast<VPReplicateRecipe>(&R)) {
        if (Rep->isPredicated())
          Candidates.insert(Rep->op_begin(), Rep->op_end());
      }
    }
  }

  bool MadeChange = false;
  while (!Candidates.empty()) {
    VPValue *Val = Candidates.pop_back_val();

    // Only values defined by a replicate recipe are considered for sinking.
    auto *ToSink = dyn_cast_or_null<VPReplicateRecipe>(Val->Def);
    if (!ToSink)
      continue;

    // Sinking requires all users to live in one block, so the block of the
    // first user is the only possible destination.
    auto *AnyUser = dyn_cast<VPRecipeBase>(*ToSink->user_begin());
    if (!AnyUser)
      continue;
    VPBasicBlock *Dest = AnyUser->getParent();

    // Nothing to do if the recipe is already in the destination; recipes with
    // side effects or memory accesses are left where they are.
    const bool AlreadyInDest = ToSink->getParent() == Dest;
    if (AlreadyInDest || ToSink->mayHaveSideEffects() ||
        ToSink->mayReadOrWriteMemory())
      continue;

    // Reject the candidate if any user is not a recipe or sits in a block
    // other than the destination.
    auto IsOutsideDest = [Dest](VPUser *U) {
      auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
      return !UserRecipe || UserRecipe->getParent() != Dest;
    };
    if (any_of(ToSink->users(), IsOutsideDest))
      continue;

    // Move the recipe to the insertion point given by getFirstNonPhi(), then
    // queue its own operands so they get a chance to be sunk as well.
    ToSink->moveBefore(*Dest, Dest->getFirstNonPhi());
    Candidates.insert(ToSink->op_begin(), ToSink->op_end());
    MadeChange = true;
  }
  return MadeChange;
}

View File

@ -28,6 +28,8 @@ struct VPlanTransforms {
Loop *OrigLoop, VPlanPtr &Plan,
LoopVectorizationLegality::InductionList &Inductions,
SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);
static bool sinkScalarOperands(VPlan &Plan);
};
} // namespace llvm

View File

@ -567,40 +567,40 @@ define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP4]] to i32
; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
; CHECK: pred.store.if17:
; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP8]] to i32
; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7
; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]]
; CHECK-NEXT: store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4
; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP8]] to i32
; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
; CHECK-NEXT: store i32 [[TMP23]], i32* [[NEXT_GEP8]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]]
; CHECK: pred.store.continue18:
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
; CHECK: pred.store.if19:
; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
; CHECK-NEXT: [[TMP27:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
; CHECK-NEXT: store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP25]]
; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP12]] to i32
; CHECK-NEXT: [[TMP27:%.*]] = shl nuw nsw i32 [[TMP26]], 7
; CHECK-NEXT: store i32 [[TMP27]], i32* [[NEXT_GEP9]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]]
; CHECK: pred.store.continue20:
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
; CHECK: pred.store.if21:
; CHECK-NEXT: [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
; CHECK-NEXT: [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
; CHECK-NEXT: store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
; CHECK-NEXT: [[TMP29:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP29]]
; CHECK-NEXT: [[TMP30:%.*]] = zext i16 [[TMP16]] to i32
; CHECK-NEXT: [[TMP31:%.*]] = shl nuw nsw i32 [[TMP30]], 7
; CHECK-NEXT: store i32 [[TMP31]], i32* [[NEXT_GEP10]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]]
; CHECK: pred.store.continue22:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4

View File

@ -16,7 +16,6 @@ define void @sink_replicate_region_1(i32 %x, i8* %ptr) optsize {
; CHECK-NEXT: Successor(s): loop.0
; CHECK: loop.0:
; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
; CHECK-NEXT: Successor(s): pred.load
; CHECK: <xVFxUF> pred.load: {
@ -26,6 +25,7 @@ define void @sink_replicate_region_1(i32 %x, i8* %ptr) optsize {
; CHECK-NEXT: CondBit: vp<%3> (loop)
; CHECK: pred.load.if:
; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep> (S->V)
; CHECK-NEXT: Successor(s): pred.load.continue
@ -106,8 +106,6 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, i32* %ptr) optsize {
; CHECK-NEXT: }
; CHECK: loop.0.split:
; CHECK-NEXT: REPLICATE ir<%add> = add vp<%6>, ir<%recur.next>
; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
; CHECK-NEXT: Successor(s): pred.store
; CHECK: <xVFxUF> pred.store: {
@ -117,6 +115,8 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, i32* %ptr) optsize {
; CHECK-NEXT: CondBit: vp<%3> (loop)
; CHECK: pred.store.if:
; CHECK-NEXT: REPLICATE ir<%add> = add vp<%6>, ir<%recur.next>
; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep>
; CHECK-NEXT: Successor(s): pred.store.continue

View File

@ -130,11 +130,11 @@ define i32 @test(i32* nocapture %f) #0 {
; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
; VEC-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
; VEC: pred.store.if1:
; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
; VEC-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], 20
; VEC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
; VEC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP11]]
; VEC-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4
; VEC-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
; VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP9]]
; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
; VEC-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP11]], 20
; VEC-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]]
; VEC: pred.store.continue2:
; VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
@ -565,12 +565,12 @@ define void @minimal_bit_widths(i1 %c) {
; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; VEC: pred.store.if2:
; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
; VEC-NEXT: [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
; VEC-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
; VEC-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
; VEC-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* undef, i64 [[TMP13]]
; VEC-NEXT: store i8 [[TMP12]], i8* [[TMP14]], align 1
; VEC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
; VEC-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* undef, i64 [[TMP10]]
; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
; VEC-NEXT: [[TMP13:%.*]] = zext i8 [[TMP12]] to i32
; VEC-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
; VEC-NEXT: store i8 [[TMP14]], i8* [[TMP11]], align 1
; VEC-NEXT: br label [[PRED_STORE_CONTINUE3]]
; VEC: pred.store.continue3:
; VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2

View File

@ -14,8 +14,6 @@ define void @sink_with_sideeffects(i1 %c, i8* %ptr) {
; CHECK-NEXT: CLONE ir<%tmp2> = getelementptr ir<%ptr>, ir<%tmp0>
; CHECK-NEXT: CLONE ir<%tmp3> = load ir<%tmp2>
; CHECK-NEXT: CLONE store ir<0>, ir<%tmp2>
; CHECK-NEXT: CLONE ir<%tmp4> = zext ir<%tmp3>
; CHECK-NEXT: CLONE ir<%tmp5> = trunc ir<%tmp4>
; CHECK-NEXT: Successor(s): if.then
; CHECK: if.then:
@ -28,6 +26,8 @@ define void @sink_with_sideeffects(i1 %c, i8* %ptr) {
; CHECK-NEXT: CondBit: ir<%c>
; CHECK: pred.store.if:
; CHECK-NEXT: CLONE ir<%tmp4> = zext ir<%tmp3>
; CHECK-NEXT: CLONE ir<%tmp5> = trunc ir<%tmp4>
; CHECK-NEXT: CLONE store ir<%tmp5>, ir<%tmp2>
; CHECK-NEXT: Successor(s): pred.store.continue

View File

@ -14,7 +14,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: loop:
; CHECK-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0>
; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: Successor(s): pred.load
; CHECK: <xVFxUF> pred.load: {
@ -24,6 +23,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: CondBit: vp<%2> (loop)
; CHECK: pred.load.if:
; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b>
; CHECK-NEXT: Successor(s): pred.load.continue
@ -33,9 +33,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: }
; CHECK: loop.0:
; CHECK-NEXT: REPLICATE ir<%add> = add vp<%5>, ir<10>
; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<2>, ir<%add>
; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: Successor(s): pred.store
; CHECK: <xVFxUF> pred.store: {
@ -45,6 +42,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: CondBit: vp<%2> (loop)
; CHECK: pred.store.if:
; CHECK-NEXT: REPLICATE ir<%add> = add vp<%5>, ir<10>
; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<2>, ir<%add>
; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: REPLICATE store ir<%mul>, ir<%gep.a>
; CHECK-NEXT: Successor(s): pred.store.continue
@ -85,7 +85,6 @@ exit:
; CHECK-NEXT: loop:
; CHECK-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0>
; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: Successor(s): pred.load
; CHECK: <xVFxUF> pred.load: {
@ -95,6 +94,7 @@ exit:
; CHECK-NEXT: CondBit: vp<%2> (loop)
; CHECK: pred.load.if:
; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b>
; CHECK-NEXT: Successor(s): pred.load.continue
@ -104,9 +104,7 @@ exit:
; CHECK-NEXT: }
; CHECK: loop.0:
; CHECK-NEXT: REPLICATE ir<%add> = add vp<%5>, ir<10>
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%indvars.iv>, ir<2>
; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
; CHECK-NEXT: Successor(s): pred.store
; CHECK: <xVFxUF> pred.store: {
@ -116,6 +114,8 @@ exit:
; CHECK-NEXT: CondBit: vp<%2> (loop)
; CHECK: pred.store.if:
; CHECK-NEXT: REPLICATE ir<%add> = add vp<%5>, ir<10>
; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.a>
; CHECK-NEXT: Successor(s): pred.store.continue
@ -156,7 +156,6 @@ exit:
; CHECK-NEXT: loop:
; CHECK-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0>
; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: Successor(s): pred.load
; CHECK: <xVFxUF> pred.load: {
@ -166,6 +165,7 @@ exit:
; CHECK-NEXT: CondBit: vp<%2> (loop)
; CHECK: pred.load.if:
; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V)
; CHECK-NEXT: Successor(s): pred.load.continue
@ -177,7 +177,6 @@ exit:
; CHECK: loop.0:
; CHECK-NEXT: WIDEN ir<%add> = add vp<%5>, ir<10>
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%indvars.iv>, ir<%add>
; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
; CHECK-NEXT: Successor(s): pred.store
; CHECK: <xVFxUF> pred.store: {
@ -187,6 +186,7 @@ exit:
; CHECK-NEXT: CondBit: vp<%2> (loop)
; CHECK: pred.store.if:
; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.a>
; CHECK-NEXT: Successor(s): pred.store.continue