[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF.
* Steps are scaled by `vscale`, a runtime value.
* Changes to circumvent the cost-model for now (temporary), so that the
  cost-model can be implemented separately.

This can vectorize the following loop [1]:

    void loop(int N, double *a, double *b) {
      #pragma clang loop vectorize_width(4, scalable)
      for (int i = 0; i < N; i++) {
        a[i] = b[i] + 1.0;
      }
    }

[1] This source-level example is based on the pragma proposed separately
    in D89031. This patch only implements the LLVM part.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D91077
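To illustrate what "steps scaled by vscale" means in IR-building terms, the sketch below uses the CreateVScale hook this patch adds to IRBuilder (see the first two hunks) to materialize a step of 4 * vscale. This is a minimal illustration, not code from the patch; the function name, the index-type parameter, and the fixed factor of 4 are made up for the example:

    // Sketch only: build a runtime loop step of 4 * vscale with the new
    // IRBuilder hook. For a scalable VF such as <vscale x 4 x double>, the
    // step is not a compile-time constant, so it becomes llvm.vscale() * 4.
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    Value *buildScalableStep(IRBuilder<> &B, Type *IdxTy) {
      Constant *MinElts = ConstantInt::get(IdxTy, 4); // known minimum lanes
      return B.CreateVScale(MinElts, "step");         // vscale * 4
    }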
@@ -879,6 +879,10 @@ public:
                           Type *ResultType,
                           const Twine &Name = "");
 
+  /// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale
+  /// will be the same type as that of \p Scaling.
+  Value *CreateVScale(Constant *Scaling, const Twine &Name = "");
+
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
@@ -80,6 +80,17 @@ static CallInst *createCallHelper(Function *Callee, ArrayRef<Value *> Ops,
   return CI;
 }
 
+Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
+  Module *M = GetInsertBlock()->getParent()->getParent();
+  assert(isa<ConstantInt>(Scaling) && "Expected constant integer");
+  Function *TheFn =
+      Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()});
+  CallInst *CI = createCallHelper(TheFn, {}, this, Name);
+  return cast<ConstantInt>(Scaling)->getSExtValue() == 1
+             ? CI
+             : CreateMul(CI, Scaling);
+}
+
 CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
                                       MaybeAlign Align, bool isVolatile,
                                       MDNode *TBAATag, MDNode *ScopeTag,
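A small design note on the helper above (an observation, not stated in the patch description): when the scaling constant is 1, CreateVScale returns the llvm.vscale call directly rather than emitting a multiply by 1, so a plain query of the hardware vector scale stays a single intrinsic call.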
@@ -1121,6 +1121,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
   return R;
 }
 
+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+  assert(isa<ConstantInt>(Step) && "Expected an integer step");
+  Constant *StepVal = ConstantInt::get(
+      Step->getType(),
+      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
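To make the helper's behaviour concrete (an illustrative example, not taken from the patch text): with a fixed VF of 4 and a step constant of 2, createStepForVF folds to the plain constant 8, whereas with a scalable VF of <vscale x 4> it emits 8 * llvm.vscale() via CreateVScale, to be evaluated at runtime.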
@@ -2277,8 +2286,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                            const InductionDescriptor &ID) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF.isVector() && "VF should be greater than one");
-  assert(!VF.isScalable() &&
-         "the code below assumes a fixed number of elements at compile time");
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
   assert(ScalarIVTy == Step->getType() &&
@@ -2303,11 +2310,24 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
           Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
               ? 1
               : VF.getKnownMinValue();
+  assert((!VF.isScalable() || Lanes == 1) &&
+         "Should never scalarize a scalable vector");
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   for (unsigned Part = 0; Part < UF; ++Part) {
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *StartIdx = getSignedIntOrFpConstant(
-          ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
+      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+                                         ScalarIVTy->getScalarSizeInBits());
+      Value *StartIdx =
+          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+      if (ScalarIVTy->isFloatingPointTy())
+        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
+      StartIdx = addFastMathFlag(Builder.CreateBinOp(
+          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
+      // The step returned by `createStepForVF` is a runtime-evaluated value
+      // when VF is scalable. Otherwise, it should be folded into a Constant.
+      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+             "Expected StartIdx to be folded to a constant when VF is not "
+             "scalable");
       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
@@ -2350,10 +2370,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
   // is known to be uniform after vectorization, this corresponds to lane zero
   // of the Part unroll iteration. Otherwise, the last instruction is the one
   // we created for the last vector lane of the Part unroll iteration.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                           ? 0
                           : VF.getKnownMinValue() - 1;
+  assert((!VF.isScalable() || LastLane == 0) &&
+         "Scalable vectorization can't lead to any scalarized values.");
   auto *LastInst = cast<Instruction>(
       VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
 
@@ -2695,7 +2716,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
 
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *DataTy = VectorType::get(ScalarDataTy, VF);
   const Align Alignment = getLoadStoreAlignment(Instr);
 
@@ -2728,6 +2748,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
       InBounds = gep->isInBounds();
 
     if (Reverse) {
+      assert(!VF.isScalable() &&
+             "Reversing vectors is not yet supported for scalable vectors.");
+
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
@@ -2739,8 +2762,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
+      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
       PartPtr->setIsInBounds(InBounds);
     }
 
@@ -2945,8 +2969,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
 
   Type *Ty = TC->getType();
   // This is where we can make the step a runtime constant.
-  assert(!VF.isScalable() && "scalable vectorization is not supported yet");
-  Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
+  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
 
   // If the tail is to be folded by masking, round the number of iterations N
   // up to a multiple of Step instead of rounding down. This is done by first
@@ -2957,6 +2980,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
   if (Cost->foldTailByMasking()) {
     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
            "VF*UF must be a power of 2 when folding tail by masking");
+    assert(!VF.isScalable() &&
+           "Tail folding not yet supported for scalable vectors");
     TC = Builder.CreateAdd(
         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
   }
@@ -3035,11 +3060,9 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
   // If tail is to be folded, vector loop takes care of all iterations.
   Value *CheckMinIters = Builder.getFalse();
   if (!Cost->foldTailByMasking()) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    CheckMinIters = Builder.CreateICmp(
-        P, Count,
-        ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
-        "min.iters.check");
+    Value *Step =
+        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
   }
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
@@ -3518,8 +3541,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
-  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Induction =
       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -4365,7 +4388,6 @@ void InnerLoopVectorizer::clearReductionWrapFlags(
 }
 
 void InnerLoopVectorizer::fixLCSSAPHIs() {
-  assert(!VF.isScalable() && "the code below assumes fixed width vectors");
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
@@ -4376,6 +4398,8 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
                               cast<Instruction>(IncomingValue), VF)
                           ? 0
                           : VF.getKnownMinValue() - 1;
+      assert((!VF.isScalable() || LastLane == 0) &&
+             "scalable vectors dont support non-uniform scalars yet");
       // Can be a loop invariant incoming value or the last scalar value to be
       // extracted from the vectorized loop.
       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -5528,7 +5552,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
 ElementCount
 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                  ElementCount UserVF) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5541,6 +5564,11 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
   if (UserVF.isNonZero()) {
+    // For now, don't verify legality of scalable vectors.
+    // This will be addressed properly in https://reviews.llvm.org/D91718.
+    if (UserVF.isScalable())
+      return UserVF;
+
     // If legally unsafe, clamp the user vectorization factor to a safe value.
     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
     if (UserVF.getFixedValue() <= MaxSafeVF)
@@ -5629,6 +5657,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
 
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
+  // FIXME: This can be fixed for scalable vectors later, because at this stage
+  // the LoopVectorizer will only consider vectorizing a loop with scalable
+  // vectors when the loop has a hint to enable vectorization for a given VF.
   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
 
   float Cost = expectedCost(ElementCount::getFixed(1)).first;
@@ -5938,7 +5969,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   }
 
   // Clamp the interleave ranges to reasonable counts.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned MaxInterleaveCount =
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
 
@@ -5954,6 +5984,13 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   // If trip count is known or estimated compile time constant, limit the
   // interleave count to be less than the trip count divided by VF, provided it
   // is at least 1.
+  //
+  // For scalable vectors we can't know if interleaving is beneficial. It may
+  // not be beneficial for small loops if none of the lanes in the second vector
+  // iterations is enabled. However, for larger loops, there is likely to be a
+  // similar benefit as for fixed-width vectors. For now, we choose to leave
+  // the InterleaveCount as if vscale is '1', although if some information about
+  // the vector is known (e.g. min vector size), we can make a better decision.
   if (BestKnownTC) {
     MaxInterleaveCount =
         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
@@ -5997,7 +6034,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   // potentially expose ILP opportunities.
   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                     << "LV: IC is " << IC << '\n'
-                    << "LV: VF is " << VF.getKnownMinValue() << '\n');
+                    << "LV: VF is " << VF << '\n');
   const bool AggressivelyInterleaveReductions =
       TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
@@ -6664,8 +6701,6 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
-  assert(!VF.isScalable() &&
-         "the cost model is not yet implemented for scalable vectorization");
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
@@ -6729,7 +6764,6 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (VF.isScalar())
     return;
   NumPredStores = 0;
@@ -7316,7 +7350,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
@@ -7339,9 +7372,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
-  if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
+  if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getFixedValue()) &&
+    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
@@ -7352,6 +7385,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     return {{UserVF, 0}};
   }
 
+  assert(!MaxVF.isScalable() &&
+         "Scalable vectors not yet supported beyond this point");
+
   for (ElementCount VF = ElementCount::getFixed(1);
        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
     // Collect Uniform and Scalar instructions after vectorization with VF.
@@ -8695,6 +8731,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
+    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
                                     *State.Instance, IsPredicated, State);
     // Insert scalar instance packing it into a vector.
@@ -8717,6 +8754,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   // instruction is uniform inwhich case generate only the first lane for each
   // of the UF parts.
   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+  assert((!State.VF.isScalable() || IsUniform) &&
+         "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
@@ -8870,12 +8909,6 @@ static bool processLoopInVPlanNativePath(
 
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
 
   // Plan how to best vectorize, return the best VF and its cost.
   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
@@ -9041,13 +9074,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
-
   unsigned UserIC = Hints.getInterleave();
 
   // Plan how to best vectorize, return the best VF and its cost.
@@ -163,7 +163,6 @@ public:
     assert(Instance.Part < UF && "Queried Scalar Part is too large.");
     assert(Instance.Lane < VF.getKnownMinValue() &&
            "Queried Scalar Lane is too large.");
-    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
 
     if (!hasAnyScalarValue(Key))
       return false;
@@ -13,8 +13,7 @@ entry:
 for.body: ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = trunc i64 %indvars.iv to i32
-  store i32 %0, i32* %arrayidx, align 4
+  store i32 42, i32* %arrayidx, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -25,7 +24,7 @@ for.end: ; preds = %for.body, %entry
 }
 
 ; CHECK-LABEL: @test2(
-; CHECK: store <8 x i32>
+; CHECK: store <vscale x 8 x i32>
 ; CHECK: ret void
 define void @test2(i32* nocapture %a, i32 %n) #0 {
 entry:
@@ -35,8 +34,7 @@ entry:
 for.body: ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = trunc i64 %indvars.iv to i32
-  store i32 %0, i32* %arrayidx, align 4
+  store i32 42, i32* %arrayidx, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -57,8 +55,7 @@ entry:
 for.body: ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = trunc i64 %indvars.iv to i32
-  store i32 %0, i32* %arrayidx, align 4
+  store i32 42, i32* %arrayidx, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -0,0 +1,101 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+
+; CHECKUF1: for.body.preheader:
+; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECKUF1: vector.ph:
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF1: vector.body:
+; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+
+; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
+; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
+
+; CHECKUF2: for.body.preheader:
+; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
+
+; CHECKUF2: vector.ph:
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
+; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF2: vector.body:
+; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
+; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+  ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}