[LV] Ignore candidate VFs with invalid costs.

This follows on from discussion on the mailing-list: https://lists.llvm.org/pipermail/llvm-dev/2021-June/151047.html to interpret an Invalid cost as 'infinitely expensive', as this simplifies some of the legalization issues with scalable vectors. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D105473
2021-07-12 09:32:15 +01:00 · 2021-07-12 09:32:15 +01:00 · d2e4ccc790
parent e4aa6ad132
commit d2e4ccc790
2 changed files with 103 additions and 26 deletions
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -1261,9 +1261,11 @@ public:
                                    const LoopVectorizationPlanner &LVP);
  /// Setup cost-based decisions for user vectorization factor.
-  void selectUserVectorizationFactor(ElementCount UserVF) {
+  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }
  /// \return The size (in bits) of the smallest and widest types in the code
@ -5725,8 +5727,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
-    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
+    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
-      return UserVF;
+      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }
    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
@ -6072,17 +6080,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
    if (i.isScalar())
      continue;
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
    VectorizationFactor Candidate(i, C.first);
    LLVM_DEBUG(
        dbgs() << "LV: Vector loop of width " << i << " costs: "
-               << (*Candidate.Cost.getValue() /
+               << (Candidate.Cost / Candidate.Width.getKnownMinValue())
                   Candidate.Width.getKnownMinValue())
               << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
               << ".\n");
@ -6109,8 +6111,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
  }
  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
-                 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
+                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
                 dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
@ -6438,8 +6439,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
-    assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
+    InstructionCost C = expectedCost(VF).first;
-    LoopCost = *expectedCost(VF).first.getValue();
+    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();
  }
  assert(LoopCost && "Non-zero loop cost expected");
@ -7295,6 +7297,8 @@ InstructionCost
 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) const {
  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();
@ -8013,17 +8017,19 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
                      << " VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
-    CM.selectUserVectorizationFactor(UserVF);
+    if (CM.selectUserVectorizationFactor(UserVF)) {
-    CM.collectInLoopReductions();
+      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    buildVPlansWithVPRecipes(UserVF, UserVF);
+      CM.collectInLoopReductions();
-    LLVM_DEBUG(printPlans(dbgs()));
+      buildVPlansWithVPRecipes(UserVF, UserVF);
-    return {{UserVF, 0}};
+      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }
  // Populate the set of Vectorization Factor Candidates.
@ -8798,8 +8804,6 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
    assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
           "Either the intrinsic cost or vector call cost must be valid");
    return UseVectorIntrinsic || !NeedToScalarize;
  };
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@ -75,7 +75,7 @@ define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
 ; CHECK-LABEL: @vec_intrinsic
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
 entry:
  %cmp7 = icmp sgt i64 %N, 0
  br i1 %cmp7, label %for.body, label %for.end
@ -95,17 +95,90 @@ for.end:
  ret void
 }
 define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_no_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
 ; CHECK-NOT: <vscale x
 entry:
  br label %for.body
 for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
  %0 = load float, float* %arrayidx, align 4
  %1 = tail call fast float @llvm.sin.f32(float %0)
  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
  store float %1, float* %arrayidx1, align 4
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
 for.cond.cleanup:                                 ; preds = %for.body
  ret void
 }
 define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_fixed_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
 ; CHECK-NOT: <vscale x
 entry:
  br label %for.body
 for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
  %0 = load float, float* %arrayidx, align 4
  %1 = tail call fast float @llvm.sin.f32(float %0) #3
  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
  store float %1, float* %arrayidx1, align 4
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
 for.cond.cleanup:                                 ; preds = %for.body
  ret void
 }
 ; Even though there are no function mappings attached to the call
 ; in the loop below we can still vectorize the loop because SVE has
 ; hardware support in the form of the 'fqsrt' instruction.
 define void @vec_sqrt_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) #0 {
 ; CHECK: @vec_sqrt_no_mapping
 ; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
 entry:
  br label %for.body
 for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
  %0 = load float, float* %arrayidx, align 4
  %1 = tail call fast float @llvm.sqrt.f32(float %0)
  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
  store float %1, float* %arrayidx1, align 4
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
 for.cond.cleanup:                                 ; preds = %for.body
  ret void
 }
 declare double @foo(double)
 declare i64 @bar(i64*)
 declare double @llvm.sin.f64(double)
 declare float @llvm.sin.f32(float)
 declare float @llvm.sqrt.f32(float)
 declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
 declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*>)
-declare <vscale x 2 x double> @sin_vec(<vscale x 2 x double>)
+declare <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double>)
 declare <2 x double> @sin_vec_v2f64(<2 x double>)
 attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" }
 attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" }
-attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec)" }
+attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec_nxv2f64)" }
 attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" }
 !1 = distinct !{!1, !2, !3}
 !2 = !{!"llvm.loop.vectorize.width", i32 2}