forked from OSchip/llvm-project
[LV] FoldTail w/o Primary Induction
Introduce a new VPWidenCanonicalIVRecipe to generate a canonical vector induction for use in fold-tail-with-masking, if a primary induction is absent. The canonical scalar IV having start = 0 and step = VF*UF, created during code -gen to control the vector loop, is widened into a canonical vector IV having start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF} and step = <VF*UF, VF*UF, ..., VF*UF>. Differential Revision: https://reviews.llvm.org/D77635
This commit is contained in:
parent
023c4d400e
commit
1678489234
|
@ -1233,15 +1233,6 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
|
|||
|
||||
LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
|
||||
|
||||
if (!PrimaryInduction) {
|
||||
reportVectorizationFailure(
|
||||
"No primary induction, cannot fold tail by masking",
|
||||
"Missing a primary induction variable in the loop, which is "
|
||||
"needed in order to fold tail by masking as required.",
|
||||
"NoPrimaryInduction", ORE, TheLoop);
|
||||
return false;
|
||||
}
|
||||
|
||||
SmallPtrSet<const Value *, 8> ReductionLiveOuts;
|
||||
|
||||
for (auto &Reduction : getReductionVars())
|
||||
|
|
|
@ -6585,6 +6585,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
|
|||
&ILV, CallbackILV};
|
||||
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
|
||||
State.TripCount = ILV.getOrCreateTripCount(nullptr);
|
||||
State.CanonicalIV = ILV.Induction;
|
||||
|
||||
//===------------------------------------------------===//
|
||||
//
|
||||
|
@ -6771,7 +6772,15 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
|
|||
|
||||
// Introduce the early-exit compare IV <= BTC to form header block mask.
|
||||
// This is used instead of IV < TC because TC may wrap, unlike BTC.
|
||||
VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
|
||||
// Start by constructing the desired canonical IV.
|
||||
VPValue *IV = nullptr;
|
||||
if (Legal->getPrimaryInduction())
|
||||
IV = Plan->getVPValue(Legal->getPrimaryInduction());
|
||||
else {
|
||||
auto IVRecipe = new VPWidenCanonicalIVRecipe();
|
||||
Builder.getInsertBlock()->appendRecipe(IVRecipe);
|
||||
IV = IVRecipe->getVPValue();
|
||||
}
|
||||
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
|
||||
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
|
||||
return BlockMaskCache[BB] = BlockMask;
|
||||
|
@ -7130,12 +7139,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
|
|||
NeedDef.insert(Branch->getCondition());
|
||||
}
|
||||
|
||||
// If the tail is to be folded by masking, the primary induction variable
|
||||
// needs to be represented in VPlan for it to model early-exit masking.
|
||||
// If the tail is to be folded by masking, the primary induction variable, if
|
||||
// exists needs to be represented in VPlan for it to model early-exit masking.
|
||||
// Also, both the Phi and the live-out instruction of each reduction are
|
||||
// required in order to introduce a select between them in VPlan.
|
||||
if (CM.foldTailByMasking()) {
|
||||
NeedDef.insert(Legal->getPrimaryInduction());
|
||||
if (Legal->getPrimaryInduction())
|
||||
NeedDef.insert(Legal->getPrimaryInduction());
|
||||
for (auto &Reduction : Legal->getReductionVars()) {
|
||||
NeedDef.insert(Reduction.first);
|
||||
NeedDef.insert(Reduction.second.getLoopExitInstr());
|
||||
|
@ -7572,9 +7582,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
|
|||
!PreferPredicateOverEpilog;
|
||||
|
||||
// 2) Next, if disabling predication is requested on the command line, honour
|
||||
// this and request a scalar epilogue. Also do this if we don't have a
|
||||
// primary induction variable, which is required for predication.
|
||||
if (PredicateOptDisabled || !LVL.getPrimaryInduction())
|
||||
// this and request a scalar epilogue.
|
||||
if (PredicateOptDisabled)
|
||||
return CM_ScalarEpilogueAllowed;
|
||||
|
||||
// 3) and 4) look if enabling predication is requested on the command line,
|
||||
|
|
|
@ -802,6 +802,29 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
|
|||
O << "\\l\"";
|
||||
}
|
||||
|
||||
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
|
||||
Value *CanonicalIV = State.CanonicalIV;
|
||||
Type *STy = CanonicalIV->getType();
|
||||
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
|
||||
Value *VStart = Builder.CreateVectorSplat(State.VF, CanonicalIV, "broadcast");
|
||||
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
|
||||
SmallVector<Constant *, 8> Indices;
|
||||
for (unsigned Lane = 0, VF = State.VF; Lane < VF; ++Lane)
|
||||
Indices.push_back(ConstantInt::get(STy, Part * VF + Lane));
|
||||
Constant *VStep = ConstantVector::get(Indices);
|
||||
// Add the consecutive indices to the vector value.
|
||||
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
|
||||
State.set(getVPValue(), CanonicalVectorIV, Part);
|
||||
}
|
||||
}
|
||||
|
||||
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
|
||||
VPSlotTracker &SlotTracker) const {
|
||||
O << " +\n" << Indent << "\"EMIT ";
|
||||
getVPValue()->printAsOperand(O, SlotTracker);
|
||||
O << " = WIDEN-CANONICAL-INDUCTION \\l\"";
|
||||
}
|
||||
|
||||
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
|
||||
|
||||
void VPValue::replaceAllUsesWith(VPValue *New) {
|
||||
|
@ -901,6 +924,8 @@ void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
|
|||
for (const VPRecipeBase &Recipe : *VPBB) {
|
||||
if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe))
|
||||
assignSlot(VPI);
|
||||
else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe))
|
||||
assignSlot(VPIV->getVPValue());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -329,6 +329,9 @@ struct VPTransformState {
|
|||
/// Values they correspond to.
|
||||
VPValue2ValueTy VPValue2Value;
|
||||
|
||||
/// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
|
||||
Value *CanonicalIV = nullptr;
|
||||
|
||||
/// Hold the trip count of the scalar loop.
|
||||
Value *TripCount = nullptr;
|
||||
|
||||
|
@ -610,6 +613,7 @@ public:
|
|||
VPPredInstPHISC,
|
||||
VPReplicateSC,
|
||||
VPWidenCallSC,
|
||||
VPWidenCanonicalIVSC,
|
||||
VPWidenGEPSC,
|
||||
VPWidenIntOrFpInductionSC,
|
||||
VPWidenMemoryInstructionSC,
|
||||
|
@ -1144,6 +1148,36 @@ public:
|
|||
VPSlotTracker &SlotTracker) const override;
|
||||
};
|
||||
|
||||
/// A Recipe for widening the canonical induction variable of the vector loop.
|
||||
class VPWidenCanonicalIVRecipe : public VPRecipeBase {
|
||||
private:
|
||||
/// A VPValue representing the canonical vector IV.
|
||||
VPValue Val;
|
||||
|
||||
public:
|
||||
VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {}
|
||||
~VPWidenCanonicalIVRecipe() override = default;
|
||||
|
||||
/// Return the VPValue representing the canonical vector induction variable of
|
||||
/// the vector loop.
|
||||
const VPValue *getVPValue() const { return &Val; }
|
||||
VPValue *getVPValue() { return &Val; }
|
||||
|
||||
/// Method to support type inquiry through isa, cast, and dyn_cast.
|
||||
static inline bool classof(const VPRecipeBase *V) {
|
||||
return V->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC;
|
||||
}
|
||||
|
||||
/// Generate a canonical vector induction variable of the vector loop, with
|
||||
/// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
|
||||
/// step = <VF*UF, VF*UF, ..., VF*UF>.
|
||||
void execute(VPTransformState &State) override;
|
||||
|
||||
/// Print the recipe.
|
||||
void print(raw_ostream &O, const Twine &Indent,
|
||||
VPSlotTracker &SlotTracker) const override;
|
||||
};
|
||||
|
||||
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
|
||||
/// holds a sequence of zero or more VPRecipe's each representing a sequence of
|
||||
/// output IR instructions.
|
||||
|
|
|
@ -168,9 +168,17 @@ define void @example2(i32 %n, i32 %x) optsize {
|
|||
ret void
|
||||
}
|
||||
|
||||
; N is unknown, we need a tail. Can't vectorize because loop has no primary
|
||||
; induction.
|
||||
; Loop has no primary induction as its integer IV has step -1 starting at
|
||||
; unknown N, but can still be vectorized.
|
||||
;CHECK-LABEL: @example3(
|
||||
; CHECK: vector.ph:
|
||||
; CHECK: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> {{.*}}, <4 x i64> undef, <4 x i32> zeroinitializer
|
||||
; CHECK: vector.body:
|
||||
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0,
|
||||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
|
||||
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[VPIV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
|
||||
; CHECK: {{.*}} = icmp ule <4 x i64> [[VPIV]], [[BROADCAST_SPLAT2]]
|
||||
;CHECK-NOT: <4 x i32>
|
||||
;CHECK: ret void
|
||||
define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
|
||||
|
@ -237,12 +245,12 @@ define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst
|
|||
; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
|
||||
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
|
||||
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
|
||||
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
|
||||
; CHECK: middle.block:
|
||||
; CHECK-NEXT: br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
|
||||
; CHECK: scalar.ph:
|
||||
; CHECK-NEXT: br label [[TMP6:%.*]]
|
||||
; CHECK: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
|
||||
; CHECK: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !11
|
||||
; CHECK: ret void
|
||||
;
|
||||
br label %1
|
||||
|
@ -353,12 +361,12 @@ define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst
|
|||
; CHECK: pred.store.continue22:
|
||||
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
|
||||
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
|
||||
; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
|
||||
; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
|
||||
; CHECK: middle.block:
|
||||
; CHECK-NEXT: br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
|
||||
; CHECK: scalar.ph:
|
||||
; CHECK-NEXT: br label [[TMP33:%.*]]
|
||||
; CHECK: br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
|
||||
; CHECK: br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !13
|
||||
; CHECK: ret void
|
||||
;
|
||||
br label %1
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
|
||||
; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -force-vector-width=4 -S | FileCheck %s
|
||||
|
||||
; Check that when we can't predicate this loop that it is still vectorised (with
|
||||
; an epilogue).
|
||||
; TODO: the reason this can't be predicated is because a primary induction
|
||||
; variable can't be found (not yet) for this counting down loop. But with that
|
||||
; fixed, this should be able to be predicated.
|
||||
; Check that a counting-down loop which has no primary induction variable
|
||||
; is vectorized with preferred predication.
|
||||
|
||||
; CHECK-LABEL: vector.body:
|
||||
; CHECK-LABEL: middle.block:
|
||||
; CHECK-NEXT: br i1 true,
|
||||
|
||||
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
|
||||
|
||||
|
|
Loading…
Reference in New Issue