forked from OSchip/llvm-project
[LV] Ignore candidate VFs with invalid costs.
This follows on from the discussion on the mailing list (https://lists.llvm.org/pipermail/llvm-dev/2021-June/151047.html) to interpret an Invalid cost as 'infinitely expensive', as this simplifies some of the legalization issues with scalable vectors.
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D105473
This commit is contained in:
parent
e4aa6ad132
commit
d2e4ccc790
|
@ -1261,9 +1261,11 @@ public:
|
||||||
const LoopVectorizationPlanner &LVP);
|
const LoopVectorizationPlanner &LVP);
|
||||||
|
|
||||||
/// Setup cost-based decisions for user vectorization factor.
|
/// Setup cost-based decisions for user vectorization factor.
|
||||||
void selectUserVectorizationFactor(ElementCount UserVF) {
|
/// \return true if the UserVF is a feasible VF to be chosen.
|
||||||
|
bool selectUserVectorizationFactor(ElementCount UserVF) {
|
||||||
collectUniformsAndScalars(UserVF);
|
collectUniformsAndScalars(UserVF);
|
||||||
collectInstsToScalarize(UserVF);
|
collectInstsToScalarize(UserVF);
|
||||||
|
return expectedCost(UserVF).first.isValid();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \return The size (in bits) of the smallest and widest types in the code
|
/// \return The size (in bits) of the smallest and widest types in the code
|
||||||
|
@ -5725,8 +5727,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
|
||||||
auto MaxSafeUserVF =
|
auto MaxSafeUserVF =
|
||||||
UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
|
UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
|
||||||
|
|
||||||
if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
|
if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
|
||||||
return UserVF;
|
// If `VF=vscale x N` is safe, then so is `VF=N`
|
||||||
|
if (UserVF.isScalable())
|
||||||
|
return FixedScalableVFPair(
|
||||||
|
ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
|
||||||
|
else
|
||||||
|
return UserVF;
|
||||||
|
}
|
||||||
|
|
||||||
assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
|
assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
|
||||||
|
|
||||||
|
@ -6072,17 +6080,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
|
||||||
if (i.isScalar())
|
if (i.isScalar())
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// Notice that the vector loop needs to be executed less times, so
|
|
||||||
// we need to divide the cost of the vector loops by the width of
|
|
||||||
// the vector elements.
|
|
||||||
VectorizationCostTy C = expectedCost(i);
|
VectorizationCostTy C = expectedCost(i);
|
||||||
|
|
||||||
assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
|
|
||||||
VectorizationFactor Candidate(i, C.first);
|
VectorizationFactor Candidate(i, C.first);
|
||||||
LLVM_DEBUG(
|
LLVM_DEBUG(
|
||||||
dbgs() << "LV: Vector loop of width " << i << " costs: "
|
dbgs() << "LV: Vector loop of width " << i << " costs: "
|
||||||
<< (*Candidate.Cost.getValue() /
|
<< (Candidate.Cost / Candidate.Width.getKnownMinValue())
|
||||||
Candidate.Width.getKnownMinValue())
|
|
||||||
<< (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
|
<< (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
|
||||||
<< ".\n");
|
<< ".\n");
|
||||||
|
|
||||||
|
@ -6109,8 +6111,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
|
||||||
}
|
}
|
||||||
|
|
||||||
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
|
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
|
||||||
*ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
|
ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
|
||||||
dbgs()
|
|
||||||
<< "LV: Vectorization seems to be not beneficial, "
|
<< "LV: Vectorization seems to be not beneficial, "
|
||||||
<< "but was forced by a user.\n");
|
<< "but was forced by a user.\n");
|
||||||
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
|
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
|
||||||
|
@ -6438,8 +6439,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
|
||||||
// If we did not calculate the cost for VF (because the user selected the VF)
|
// If we did not calculate the cost for VF (because the user selected the VF)
|
||||||
// then we calculate the cost of VF here.
|
// then we calculate the cost of VF here.
|
||||||
if (LoopCost == 0) {
|
if (LoopCost == 0) {
|
||||||
assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
|
InstructionCost C = expectedCost(VF).first;
|
||||||
LoopCost = *expectedCost(VF).first.getValue();
|
assert(C.isValid() && "Expected to have chosen a VF with valid cost");
|
||||||
|
LoopCost = *C.getValue();
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(LoopCost && "Non-zero loop cost expected");
|
assert(LoopCost && "Non-zero loop cost expected");
|
||||||
|
@ -7295,6 +7297,8 @@ InstructionCost
|
||||||
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
|
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
|
||||||
ElementCount VF) const {
|
ElementCount VF) const {
|
||||||
|
|
||||||
|
// There is no mechanism yet to create a scalable scalarization loop,
|
||||||
|
// so this is currently Invalid.
|
||||||
if (VF.isScalable())
|
if (VF.isScalable())
|
||||||
return InstructionCost::getInvalid();
|
return InstructionCost::getInvalid();
|
||||||
|
|
||||||
|
@ -8013,17 +8017,19 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
|
||||||
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
|
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
|
||||||
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
|
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
|
||||||
if (!UserVF.isZero() && UserVFIsLegal) {
|
if (!UserVF.isZero() && UserVFIsLegal) {
|
||||||
LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
|
|
||||||
<< " VF " << UserVF << ".\n");
|
|
||||||
assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
|
assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
|
||||||
"VF needs to be a power of two");
|
"VF needs to be a power of two");
|
||||||
// Collect the instructions (and their associated costs) that will be more
|
// Collect the instructions (and their associated costs) that will be more
|
||||||
// profitable to scalarize.
|
// profitable to scalarize.
|
||||||
CM.selectUserVectorizationFactor(UserVF);
|
if (CM.selectUserVectorizationFactor(UserVF)) {
|
||||||
CM.collectInLoopReductions();
|
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
|
||||||
buildVPlansWithVPRecipes(UserVF, UserVF);
|
CM.collectInLoopReductions();
|
||||||
LLVM_DEBUG(printPlans(dbgs()));
|
buildVPlansWithVPRecipes(UserVF, UserVF);
|
||||||
return {{UserVF, 0}};
|
LLVM_DEBUG(printPlans(dbgs()));
|
||||||
|
return {{UserVF, 0}};
|
||||||
|
} else
|
||||||
|
reportVectorizationInfo("UserVF ignored because of invalid costs.",
|
||||||
|
"InvalidCost", ORE, OrigLoop);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Populate the set of Vectorization Factor Candidates.
|
// Populate the set of Vectorization Factor Candidates.
|
||||||
|
@ -8798,8 +8804,6 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
|
||||||
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
|
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
|
||||||
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
|
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
|
||||||
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
|
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
|
||||||
assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
|
|
||||||
"Either the intrinsic cost or vector call cost must be valid");
|
|
||||||
return UseVectorIntrinsic || !NeedToScalarize;
|
return UseVectorIntrinsic || !NeedToScalarize;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -75,7 +75,7 @@ define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
|
||||||
; CHECK-LABEL: @vec_intrinsic
|
; CHECK-LABEL: @vec_intrinsic
|
||||||
; CHECK: vector.body:
|
; CHECK: vector.body:
|
||||||
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
|
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
|
||||||
; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
|
; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
|
||||||
entry:
|
entry:
|
||||||
%cmp7 = icmp sgt i64 %N, 0
|
%cmp7 = icmp sgt i64 %N, 0
|
||||||
br i1 %cmp7, label %for.body, label %for.end
|
br i1 %cmp7, label %for.body, label %for.end
|
||||||
|
@ -95,17 +95,90 @@ for.end:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
|
||||||
|
; CHECK: @vec_sin_no_mapping
|
||||||
|
; CHECK: call fast <2 x float> @llvm.sin.v2f32
|
||||||
|
; CHECK-NOT: <vscale x
|
||||||
|
entry:
|
||||||
|
br label %for.body
|
||||||
|
|
||||||
|
for.body: ; preds = %entry, %for.body
|
||||||
|
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
|
||||||
|
%arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
|
||||||
|
%0 = load float, float* %arrayidx, align 4
|
||||||
|
%1 = tail call fast float @llvm.sin.f32(float %0)
|
||||||
|
%arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
|
||||||
|
store float %1, float* %arrayidx1, align 4
|
||||||
|
%inc = add nuw nsw i64 %i.07, 1
|
||||||
|
%exitcond.not = icmp eq i64 %inc, %n
|
||||||
|
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
|
||||||
|
|
||||||
|
for.cond.cleanup: ; preds = %for.body
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
|
||||||
|
; CHECK: @vec_sin_fixed_mapping
|
||||||
|
; CHECK: call fast <2 x float> @llvm.sin.v2f32
|
||||||
|
; CHECK-NOT: <vscale x
|
||||||
|
entry:
|
||||||
|
br label %for.body
|
||||||
|
|
||||||
|
for.body: ; preds = %entry, %for.body
|
||||||
|
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
|
||||||
|
%arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
|
||||||
|
%0 = load float, float* %arrayidx, align 4
|
||||||
|
%1 = tail call fast float @llvm.sin.f32(float %0) #3
|
||||||
|
%arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
|
||||||
|
store float %1, float* %arrayidx1, align 4
|
||||||
|
%inc = add nuw nsw i64 %i.07, 1
|
||||||
|
%exitcond.not = icmp eq i64 %inc, %n
|
||||||
|
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
|
||||||
|
|
||||||
|
for.cond.cleanup: ; preds = %for.body
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Even though there are no function mappings attached to the call
|
||||||
|
; in the loop below we can still vectorize the loop because SVE has
|
||||||
|
; hardware support in the form of the 'fsqrt' instruction.
|
||||||
|
define void @vec_sqrt_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) #0 {
|
||||||
|
; CHECK: @vec_sqrt_no_mapping
|
||||||
|
; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
|
||||||
|
entry:
|
||||||
|
br label %for.body
|
||||||
|
|
||||||
|
for.body: ; preds = %entry, %for.body
|
||||||
|
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
|
||||||
|
%arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
|
||||||
|
%0 = load float, float* %arrayidx, align 4
|
||||||
|
%1 = tail call fast float @llvm.sqrt.f32(float %0)
|
||||||
|
%arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
|
||||||
|
store float %1, float* %arrayidx1, align 4
|
||||||
|
%inc = add nuw nsw i64 %i.07, 1
|
||||||
|
%exitcond.not = icmp eq i64 %inc, %n
|
||||||
|
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
|
||||||
|
|
||||||
|
for.cond.cleanup: ; preds = %for.body
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
declare double @foo(double)
|
declare double @foo(double)
|
||||||
declare i64 @bar(i64*)
|
declare i64 @bar(i64*)
|
||||||
declare double @llvm.sin.f64(double)
|
declare double @llvm.sin.f64(double)
|
||||||
|
declare float @llvm.sin.f32(float)
|
||||||
|
declare float @llvm.sqrt.f32(float)
|
||||||
|
|
||||||
declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
|
declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
|
||||||
declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*>)
|
declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*>)
|
||||||
declare <vscale x 2 x double> @sin_vec(<vscale x 2 x double>)
|
declare <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double>)
|
||||||
|
declare <2 x double> @sin_vec_v2f64(<2 x double>)
|
||||||
|
|
||||||
attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" }
|
attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" }
|
||||||
attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" }
|
attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" }
|
||||||
attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec)" }
|
attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec_nxv2f64)" }
|
||||||
|
attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" }
|
||||||
|
|
||||||
!1 = distinct !{!1, !2, !3}
|
!1 = distinct !{!1, !2, !3}
|
||||||
!2 = !{!"llvm.loop.vectorize.width", i32 2}
|
!2 = !{!"llvm.loop.vectorize.width", i32 2}
|
||||||
|
|
Loading…
Reference in New Issue