Revert "[CostModel] Remove VF from IntrinsicCostAttributes"

This reverts commit 502a67dd7f.

This exposed a failure in the test-suite build on PowerPC;
revert to unblock the buildbot first.
Dave will re-commit in https://reviews.llvm.org/D96287.

Thanks Dave.
Jinsong Ji 2021-02-09 02:12:54 +00:00
parent 21e8bb8325
commit 9202806241
10 changed files with 236 additions and 191 deletions


@@ -118,32 +118,44 @@ class IntrinsicCostAttributes {
SmallVector<Type *, 4> ParamTys;
SmallVector<const Value *, 4> Arguments;
FastMathFlags FMF;
ElementCount VF = ElementCount::getFixed(1);
// If ScalarizationCost is UINT_MAX, the cost of scalarizing the
// arguments and the return value will be computed based on types.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
public:
IntrinsicCostAttributes(
Intrinsic::ID Id, const CallBase &CI,
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max());
IntrinsicCostAttributes(const IntrinsicInst &I);
IntrinsicCostAttributes(
Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
unsigned ScalarCost = std::numeric_limits<unsigned>::max());
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
ElementCount Factor);
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
ElementCount Factor, unsigned ScalarCost);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys, FastMathFlags Flags);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys, FastMathFlags Flags,
unsigned ScalarCost);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys, FastMathFlags Flags,
unsigned ScalarCost,
const IntrinsicInst *I);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<const Value *> Args);
IntrinsicCostAttributes(
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
const IntrinsicInst *I = nullptr,
unsigned ScalarCost = std::numeric_limits<unsigned>::max());
Intrinsic::ID getID() const { return IID; }
const IntrinsicInst *getInst() const { return II; }
Type *getReturnType() const { return RetTy; }
ElementCount getVectorFactor() const { return VF; }
FastMathFlags getFlags() const { return FMF; }
unsigned getScalarizationCost() const { return ScalarizationCost; }
const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }

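For context, here is a minimal usage sketch of the interface restored above (an editor's illustration, not code from the commit; it assumes the LLVM headers at this revision, and `ID`, `Call`, `VF`, `VecRetTy`, `ArgTys`, `FMF`, and `TTI` are placeholders for values a caller already has):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Vectorizer-style query: describe the scalar call plus an explicit
// vectorization factor; the target widens the types internally. Note that
// the restored constructor asserts the factor is not scalable.
static InstructionCost vectorizerStyleCost(Intrinsic::ID ID, const CallInst &Call,
                                           ElementCount VF,
                                           const TargetTransformInfo &TTI) {
  IntrinsicCostAttributes Attrs(ID, Call, VF);
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}

// Type-based query: the return and parameter types are already vector types,
// so the vector factor keeps its default of 1.
static InstructionCost typeBasedCost(Intrinsic::ID ID, Type *VecRetTy,
                                     ArrayRef<Type *> ArgTys, FastMathFlags FMF,
                                     const TargetTransformInfo &TTI) {
  IntrinsicCostAttributes Attrs(ID, VecRetTy, ArgTys, FMF);
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}

Carrying the VF in the attributes lets a vectorizer ask for the cost of a widened intrinsic before it has materialized any vector types, which is the calling convention the loop and SLP vectorizer hunks below rely on.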

@@ -1211,9 +1211,12 @@ public:
Type *RetTy = ICA.getReturnType();
ElementCount VF = ICA.getVectorFactor();
ElementCount RetVF =
(RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
: ElementCount::getFixed(1));
assert((RetVF.isScalar() || VF.isScalar()) &&
"VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
@@ -1223,13 +1226,15 @@ public:
case Intrinsic::cttz:
// FIXME: If necessary, this should go in target-specific overrides.
if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
if (VF.isScalar() && RetVF.isScalar() &&
getTLI()->isCheapToSpeculateCttz())
return TargetTransformInfo::TCC_Basic;
break;
case Intrinsic::ctlz:
// FIXME: If necessary, this should go in target-specific overrides.
if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
if (VF.isScalar() && RetVF.isScalar() &&
getTLI()->isCheapToSpeculateCtlz())
return TargetTransformInfo::TCC_Basic;
break;
@@ -1237,14 +1242,16 @@ public:
return thisT()->getMemcpyCost(ICA.getInst());
case Intrinsic::masked_scatter: {
assert(VF.isScalar() && "Can't vectorize types here.");
const Value *Mask = Args[3];
bool VarMask = !isa<Constant>(Mask);
Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
return thisT()->getGatherScatterOpCost(Instruction::Store,
ICA.getArgTypes()[0], Args[1],
Args[0]->getType(), Args[1],
VarMask, Alignment, CostKind, I);
}
case Intrinsic::masked_gather: {
assert(VF.isScalar() && "Can't vectorize types here.");
const Value *Mask = Args[2];
bool VarMask = !isa<Constant>(Mask);
Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
@@ -1282,13 +1289,13 @@ public:
case Intrinsic::vector_reduce_fmin:
case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin: {
IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
case Intrinsic::vector_reduce_fadd:
case Intrinsic::vector_reduce_fmul: {
IntrinsicCostAttributes Attrs(
IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
case Intrinsic::fshl:
@@ -1340,20 +1347,32 @@ public:
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
// Assume that we need to scalarize this intrinsic.
SmallVector<Type *, 4> Types;
for (const Value *Op : Args) {
Type *OpTy = Op->getType();
assert(VF.isScalar() || !OpTy->isVectorTy());
Types.push_back(VF.isScalar()
? OpTy
: FixedVectorType::get(OpTy, VF.getKnownMinValue()));
}
if (VF.isVector() && !RetTy->isVoidTy())
RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
// Compute the scalarization overhead based on Args for a vector
// intrinsic.
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
if (RetVF.isVector()) {
if (RetVF.isVector() || VF.isVector()) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost +=
getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
}
IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
ScalarizationCost);
IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
@@ -1596,7 +1615,7 @@ public:
// SatMin -> Overflow && SumDiff >= 0
unsigned Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);
ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
@@ -1617,7 +1636,7 @@ public:
unsigned Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);
ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,

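As a small illustration of the widening that the scalarization hunk above reinstates (an editor's sketch, not code from the commit): when a caller supplies a scalar return type together with VF > 1, the generic implementation rebuilds the corresponding fixed vector types before computing the scalarization overhead and recursing into the type-based cost.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Vectorizer-style inputs: a scalar i16 return type and a fixed VF of 8.
  Type *ScalarRetTy = Type::getInt16Ty(Ctx);
  ElementCount VF = ElementCount::getFixed(8);
  // The hunk above widens the return type (and each operand type) like this,
  // producing <8 x i16> for the type-based cost query.
  Type *WideRetTy = FixedVectorType::get(ScalarRetTy, VF.getKnownMinValue());
  (void)WideRetTy;
  return 0;
}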

@@ -54,26 +54,86 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
return true;
}
IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
II(&I), RetTy(I.getType()), IID(I.getIntrinsicID()) {
FunctionType *FTy = I.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
Arguments.insert(Arguments.begin(), I.arg_begin(), I.arg_end());
if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
FMF = FPMO->getFastMathFlags();
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
unsigned ScalarizationCost)
: II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
ScalarizationCost(ScalarizationCost) {
const CallBase &CI) :
II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id) {
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
FunctionType *FTy =
CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
ElementCount Factor)
: RetTy(CI.getType()), IID(Id), VF(Factor) {
assert(!Factor.isScalable() && "Scalable vectors are not yet supported");
if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
FunctionType *FTy =
CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
ElementCount Factor,
unsigned ScalarCost)
: RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
FunctionType *FTy =
CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys,
FastMathFlags Flags) :
RetTy(RTy), IID(Id), FMF(Flags) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
const IntrinsicInst *I,
unsigned ScalarCost)
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
unsigned ScalarCost) :
RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
unsigned ScalarCost,
const IntrinsicInst *I) :
II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys) :
RetTy(RTy), IID(Id) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
@@ -87,17 +147,6 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
ParamTys.push_back(Arguments[Idx]->getType());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
const IntrinsicInst *I,
unsigned ScalarCost)
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
}
bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
LoopInfo &LI, DominatorTree &DT,
bool ForceNestedLoop,


@@ -731,28 +731,40 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
Type *RetTy = ICA.getReturnType();
unsigned VF = ICA.getVectorFactor().getFixedValue();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
: 1);
assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
// Assume that we need to scalarize this intrinsic.
SmallVector<Type *, 4> Types;
for (const Value *Op : Args) {
Type *OpTy = Op->getType();
assert(VF == 1 || !OpTy->isVectorTy());
Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
}
if (VF > 1 && !RetTy->isVoidTy())
RetTy = FixedVectorType::get(RetTy, VF);
// Compute the scalarization overhead based on Args for a vector
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
if (RetVF > 1) {
if (RetVF > 1 || VF > 1) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
}
IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
ScalarizationCost);
IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
ScalarizationCost, I);
return getIntrinsicInstrCost(Attrs, CostKind);
}
@@ -772,20 +784,9 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// TODO: Get more refined intrinsic costs?
unsigned InstRate = getQuarterRateInstrCost(CostKind);
switch (ICA.getID()) {
case Intrinsic::fma:
if (ICA.getID() == Intrinsic::fma) {
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
break;
case Intrinsic::uadd_sat:
case Intrinsic::usub_sat:
case Intrinsic::sadd_sat:
case Intrinsic::ssub_sat:
static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
NElts = 1;
break;
}
return LT.first * NElts * InstRate;


@@ -1550,16 +1550,21 @@ int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
case Intrinsic::usub_sat: {
if (!ST->hasMVEIntegerOps())
break;
// Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
Type *VT = ICA.getReturnType();
if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
VT = VectorType::get(VT, ICA.getVectorFactor());
std::pair<int, MVT> LT =
TLI->getTypeLegalizationCost(DL, VT);
if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
LT.second == MVT::v16i8) {
// This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
// This is a base cost of 1 for the vadd, plus 3 extract shifts if we
// need to extend the type, as it uses shr(qadd(shl, shl)).
unsigned Instrs =
LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
unsigned Instrs = LT.second.getScalarSizeInBits() ==
ICA.getReturnType()->getScalarSizeInBits()
? 1
: 4;
return LT.first * ST->getMVEVectorCostFactor() * Instrs;
}
break;


@@ -3803,27 +3803,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
return Elt;
return VectorType::get(Elt, VF);
};
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
SmallVector<Type *> ParamTys;
std::transform(FTy->param_begin(), FTy->param_end(), ParamTys.begin(),
[&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
dyn_cast<IntrinsicInst>(CI));
IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
return TTI.getIntrinsicInstrCost(CostAttrs,
TargetTransformInfo::TCK_RecipThroughput);
}


@@ -3417,16 +3417,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
SmallVector<Type *, 4> VecTys;
for (Use &Arg : CI->args())
VecTys.push_back(
FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
FastMathFlags FMF;
if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
FMF = FPCI->getFastMathFlags();
SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
dyn_cast<IntrinsicInst>(CI));
IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
@@ -3437,6 +3428,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
auto LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
SmallVector<Type *, 4> VecTys;
for (Use &Arg : CI->args())
VecTys.push_back(
FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
TTI::TCK_RecipThroughput);
@@ -3802,7 +3798,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {

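To illustrate how the SLP vectorizer hunks above use the two restored overloads (a hypothetical helper, not code from the commit; `ID`, `CI`, `VecTy`, and `TTI` are placeholders, under the same assumptions as the earlier sketch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Rough profitability check in the spirit of getVectorCallCosts/getEntryCost:
// compare one wide intrinsic call against VF copies of the scalar call.
static bool vectorCallIsCheaper(Intrinsic::ID ID, const CallInst &CI,
                                FixedVectorType *VecTy,
                                const TargetTransformInfo &TTI) {
  // Per-element scalar cost: VF = 1 with the scalarization cost pinned to 1.
  IntrinsicCostAttributes ScalarAttrs(ID, CI, ElementCount::getFixed(1), 1);
  InstructionCost ScalarCost =
      TTI.getIntrinsicInstrCost(ScalarAttrs, TargetTransformInfo::TCK_RecipThroughput);
  // Cost of a single wide call covering all lanes of VecTy.
  IntrinsicCostAttributes VecAttrs(ID, CI, VecTy->getElementCount());
  InstructionCost VecCost =
      TTI.getIntrinsicInstrCost(VecAttrs, TargetTransformInfo::TCK_RecipThroughput);
  return VecCost < ScalarCost * VecTy->getNumElements();
}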

@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
; CHECK-COST-LABEL: sadd
; CHECK-COST: Found an estimated cost of 10 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 26 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 58 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 122 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @saddsat(
@@ -21,38 +21,29 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFFSET:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 8
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[NEXT_GEP6]] to <8 x i16>*
; CHECK-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* [[TMP8]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[NEXT_GEP6]], i64 8
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <2 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[WIDE_LOAD]], <2 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[NEXT_GEP5]] to <2 x i16>*
; CHECK-NEXT: store <2 x i16> [[TMP4]], <2 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -66,10 +57,10 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]])
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP7]], i16 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1
; CHECK-NEXT: store i16 [[TMP13]], i16* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: store i16 [[TMP8]], i16* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP2:!llvm.loop !.*]]
@@ -99,10 +90,10 @@ while.end: ; preds = %while.body, %entry
; CHECK-COST-LABEL: umin
; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 6 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 14 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 30 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 62 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @umin(
@@ -116,87 +107,78 @@ define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 31
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 15
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 16
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[NEXT_GEP3]] to <16 x i8>*
; CHECK-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* [[TMP8]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i64 16
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[NEXT_GEP2]] to <16 x i8>*
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
; CHECK-NEXT: [[CAST_CRD7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD7]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], 1
; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP14]], 8589934584
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP8]], 1
; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP9]], 8589934584
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC4]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC4]]
; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC4]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]]
; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]]
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2
; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]])
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>*
; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2
; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX10]], 8
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]]
; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX5]]
; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX5]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[NEXT_GEP17]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP10]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP18]] to <8 x i8>*
; CHECK-NEXT: store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2
; CHECK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX5]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP14]], [[N_VEC9]]
; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP9]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8* [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]])
; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP14]], i8 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: store i8 [[TMP15]], i8* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP6:!llvm.loop !.*]]


@@ -157,7 +157,7 @@ while.end: ; preds = %while.body, %entry
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 4 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @cttz(


@@ -4,22 +4,20 @@
; Regression test for a bug in the SLP vectorizer that was causing
; these rotates to be incorrectly combined into a vector rotate.
; The bug fix is at https://reviews.llvm.org/D85759. This test has
; been pre-committed to demonstrate the regressed behavior and provide
; a clear diff for the bug fix.
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[B]] to i64
; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
; CHECK-NEXT: store i64 [[C]], i64* [[OUT:%.*]], align 8
; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64
; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
; CHECK-NEXT: store i64 [[F]], i64* [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[X:%.*]], <2 x i64> [[X]], <2 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[OUT:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8
; CHECK-NEXT: ret void
;
%a = extractelement <2 x i64> %x, i32 0