From 92028062413907e1c10b16e7c338e0745a11a051 Mon Sep 17 00:00:00 2001
From: Jinsong Ji
Date: Tue, 9 Feb 2021 02:12:54 +0000
Subject: [PATCH] Revert "[CostModel] Remove VF from IntrinsicCostAttributes"

This reverts commit 502a67dd7f23901834e05071ab253889f671b5d9.

This exposes a failure in the test-suite build on PowerPC; revert to
unblock the buildbot first. Dave will re-commit in
https://reviews.llvm.org/D96287.

Thanks Dave.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  38 +++--
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  43 +++--
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  87 +++++++---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  33 ++--
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  11 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  19 +--
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  18 +-
 .../LoopVectorize/AArch64/intrinsiccost.ll    | 156 ++++++++----------
 .../LoopVectorize/X86/intrinsiccost.ll        |   2 +-
 .../WebAssembly/no-vectorize-rotate.ll        |  20 +--
 10 files changed, 236 insertions(+), 191 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 04106eca81b1..992e15500d9c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -118,32 +118,44 @@ class IntrinsicCostAttributes {
   SmallVector<Type *, 4> ParamTys;
   SmallVector<const Value *, 4> Arguments;
   FastMathFlags FMF;
+  ElementCount VF = ElementCount::getFixed(1);
   // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
   // arguments and the return value will be computed based on types.
   unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();

 public:
-  IntrinsicCostAttributes(
-      Intrinsic::ID Id, const CallBase &CI,
-      unsigned ScalarizationCost = std::numeric_limits<unsigned>::max());
+  IntrinsicCostAttributes(const IntrinsicInst &I);

-  IntrinsicCostAttributes(
-      Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
-      FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
-      unsigned ScalarCost = std::numeric_limits<unsigned>::max());
+  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
+
+  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
+                          ElementCount Factor);
+
+  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
+                          ElementCount Factor, unsigned ScalarCost);
+
+  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                          ArrayRef<Type *> Tys, FastMathFlags Flags);
+
+  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                          ArrayRef<Type *> Tys, FastMathFlags Flags,
+                          unsigned ScalarCost);
+
+  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                          ArrayRef<Type *> Tys, FastMathFlags Flags,
+                          unsigned ScalarCost,
+                          const IntrinsicInst *I);
+
+  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                          ArrayRef<Type *> Tys);

   IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                           ArrayRef<const Value *> Args);

-  IntrinsicCostAttributes(
-      Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
-      ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
-      const IntrinsicInst *I = nullptr,
-      unsigned ScalarCost = std::numeric_limits<unsigned>::max());
-
   Intrinsic::ID getID() const { return IID; }
   const IntrinsicInst *getInst() const { return II; }
   Type *getReturnType() const { return RetTy; }
+  ElementCount getVectorFactor() const { return VF; }
   FastMathFlags getFlags() const { return FMF; }
   unsigned getScalarizationCost() const { return ScalarizationCost; }
   const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index d5ab2fb6d4f2..75e897c2db91 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1211,9 +1211,12 @@ public:
     Type *RetTy = ICA.getReturnType();
+    ElementCount VF = ICA.getVectorFactor();
     ElementCount RetVF =
         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                              : ElementCount::getFixed(1));
+    assert((RetVF.isScalar() || VF.isScalar()) &&
+           "VF > 1 and RetVF is a vector type");
     const IntrinsicInst *I = ICA.getInst();
     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
     FastMathFlags FMF = ICA.getFlags();
@@ -1223,13 +1226,15 @@ public:

     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
+      if (VF.isScalar() && RetVF.isScalar() &&
+          getTLI()->isCheapToSpeculateCttz())
         return TargetTransformInfo::TCC_Basic;
       break;

     case Intrinsic::ctlz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
+      if (VF.isScalar() && RetVF.isScalar() &&
+          getTLI()->isCheapToSpeculateCtlz())
         return TargetTransformInfo::TCC_Basic;
       break;
@@ -1237,14 +1242,16 @@
       return thisT()->getMemcpyCost(ICA.getInst());

     case Intrinsic::masked_scatter: {
+      assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
       return thisT()->getGatherScatterOpCost(Instruction::Store,
-                                             ICA.getArgTypes()[0], Args[1],
+                                             Args[0]->getType(), Args[1],
                                              VarMask, Alignment, CostKind, I);
     }
     case Intrinsic::masked_gather: {
+      assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
@@ -1282,13 +1289,13 @@
     case Intrinsic::vector_reduce_fmin:
     case Intrinsic::vector_reduce_umax:
     case Intrinsic::vector_reduce_umin: {
-      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
+      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
     }
     case Intrinsic::vector_reduce_fadd:
     case Intrinsic::vector_reduce_fmul: {
       IntrinsicCostAttributes Attrs(
-          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
+          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
     }
     case Intrinsic::fshl:
@@ -1340,20 +1347,32 @@
       return BaseT::getIntrinsicInstrCost(ICA, CostKind);

     // Assume that we need to scalarize this intrinsic.
+    SmallVector<Type *, 4> Types;
+    for (const Value *Op : Args) {
+      Type *OpTy = Op->getType();
+      assert(VF.isScalar() || !OpTy->isVectorTy());
+      Types.push_back(VF.isScalar()
+                          ? OpTy
+                          : FixedVectorType::get(OpTy, VF.getKnownMinValue()));
+    }
+
+    if (VF.isVector() && !RetTy->isVoidTy())
+      RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
+
     // Compute the scalarization overhead based on Args for a vector
-    // intrinsic.
+    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+    // CostModel will pass a vector RetTy and VF is 1.
     unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
-    if (RetVF.isVector()) {
+    if (RetVF.isVector() || VF.isVector()) {
       ScalarizationCost = 0;
       if (!RetTy->isVoidTy())
         ScalarizationCost +=
             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
       ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
+          getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
     }

-    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
-                                  ScalarizationCost);
+    IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
   }

@@ -1596,7 +1615,7 @@ public:
       // SatMin -> Overflow && SumDiff >= 0
       unsigned Cost = 0;
       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
-                                    nullptr, ScalarizationCostPassed);
+                                    ScalarizationCostPassed);
       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
@@ -1617,7 +1636,7 @@ public:

       unsigned Cost = 0;
       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
-                                    nullptr, ScalarizationCostPassed);
+                                    ScalarizationCostPassed);
       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 5fde2b471da8..e376d42c15bd 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -54,26 +54,86 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
   return true;
 }

+IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
+    II(&I), RetTy(I.getType()), IID(I.getIntrinsicID()) {
+
+  FunctionType *FTy = I.getCalledFunction()->getFunctionType();
+  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+  Arguments.insert(Arguments.begin(), I.arg_begin(), I.arg_end());
+  if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
+    FMF = FPMO->getFastMathFlags();
+}
+
 IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
-                                                 const CallBase &CI,
-                                                 unsigned ScalarizationCost)
-    : II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
-      ScalarizationCost(ScalarizationCost) {
+                                                 const CallBase &CI) :
+    II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id) {

   if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
     FMF = FPMO->getFastMathFlags();

   Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
-  FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
+  FunctionType *FTy =
+      CI.getCalledFunction()->getFunctionType();
+  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+                                                 const CallBase &CI,
+                                                 ElementCount Factor)
+    : RetTy(CI.getType()), IID(Id), VF(Factor) {
+
+  assert(!Factor.isScalable() && "Scalable vectors are not yet supported");
+  if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+    FMF = FPMO->getFastMathFlags();
+
+  Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
+  FunctionType *FTy =
+      CI.getCalledFunction()->getFunctionType();
+  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+                                                 const CallBase &CI,
+                                                 ElementCount Factor,
+                                                 unsigned ScalarCost)
+    : RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
+
+  if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+    FMF = FPMO->getFastMathFlags();
+
+  Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
+  FunctionType *FTy =
+      CI.getCalledFunction()->getFunctionType();
   ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
 }

+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                                                 ArrayRef<Type *> Tys,
+                                                 FastMathFlags Flags) :
+    RetTy(RTy), IID(Id), FMF(Flags) {
+  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
 IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                                                  ArrayRef<Type *> Tys,
                                                  FastMathFlags Flags,
-                                                 const IntrinsicInst *I,
-                                                 unsigned ScalarCost)
-    : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+                                                 unsigned ScalarCost) :
+    RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                                                 ArrayRef<Type *> Tys,
+                                                 FastMathFlags Flags,
+                                                 unsigned ScalarCost,
+                                                 const IntrinsicInst *I) :
+    II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                                                 ArrayRef<Type *> Tys) :
+    RetTy(RTy), IID(Id) {
   ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
 }

@@ -87,17 +147,6 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
     ParamTys.push_back(Arguments[Idx]->getType());
 }

-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                                                 ArrayRef<const Value *> Args,
-                                                 ArrayRef<Type *> Tys,
-                                                 FastMathFlags Flags,
-                                                 const IntrinsicInst *I,
-                                                 unsigned ScalarCost)
-    : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
-  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
-  Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
-}
-
 bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
                                                LoopInfo &LI, DominatorTree &DT,
                                                bool ForceNestedLoop,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7c33f22d29ef..7b8a79640bb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -731,28 +731,40 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   if (ICA.isTypeBasedOnly())
     return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

+  Type *RetTy = ICA.getReturnType();
+  unsigned VF = ICA.getVectorFactor().getFixedValue();
   unsigned RetVF =
       (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
                            : 1);
+  assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
   const IntrinsicInst *I = ICA.getInst();
   const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
   FastMathFlags FMF = ICA.getFlags();
   // Assume that we need to scalarize this intrinsic.
+  SmallVector<Type *, 4> Types;
+  for (const Value *Op : Args) {
+    Type *OpTy = Op->getType();
+    assert(VF == 1 || !OpTy->isVectorTy());
+    Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
+  }
+
+  if (VF > 1 && !RetTy->isVoidTy())
+    RetTy = FixedVectorType::get(RetTy, VF);

   // Compute the scalarization overhead based on Args for a vector
   // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
   // CostModel will pass a vector RetTy and VF is 1.
   unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
-  if (RetVF > 1) {
+  if (RetVF > 1 || VF > 1) {
     ScalarizationCost = 0;
     if (!RetTy->isVoidTy())
       ScalarizationCost +=
           getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
-    ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
+    ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
   }

-  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
-                                ScalarizationCost);
+  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+                                ScalarizationCost, I);
   return getIntrinsicInstrCost(Attrs, CostKind);
 }

@@ -772,20 +784,9 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   // TODO: Get more refined intrinsic costs?
   unsigned InstRate = getQuarterRateInstrCost(CostKind);
-
-  switch (ICA.getID()) {
-  case Intrinsic::fma:
+  if (ICA.getID() == Intrinsic::fma) {
     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                    : getQuarterRateInstrCost(CostKind);
-    break;
-  case Intrinsic::uadd_sat:
-  case Intrinsic::usub_sat:
-  case Intrinsic::sadd_sat:
-  case Intrinsic::ssub_sat:
-    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
-    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
-      NElts = 1;
-    break;
   }

   return LT.first * NElts * InstRate;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index d674dd9000bf..af67839c2d75 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1550,16 +1550,21 @@ int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   case Intrinsic::usub_sat: {
     if (!ST->hasMVEIntegerOps())
       break;
+    // Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
     Type *VT = ICA.getReturnType();
+    if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
+      VT = VectorType::get(VT, ICA.getVectorFactor());
+
     std::pair<int, MVT> LT =
         TLI->getTypeLegalizationCost(DL, VT);
     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
         LT.second == MVT::v16i8) {
-      // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
+      // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
       // need to extend the type, as it uses shr(qadd(shl, shl)).
-      unsigned Instrs =
-          LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
+      unsigned Instrs = LT.second.getScalarSizeInBits() ==
+                                ICA.getReturnType()->getScalarSizeInBits()
+                            ? 1
+                            : 4;
       return LT.first * ST->getMVEVectorCostFactor() * Instrs;
     }
     break;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e799a948da1a..d6223415c154 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3803,27 +3803,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,

 InstructionCost
 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                    ElementCount VF) {
-  auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
-    if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-      return Elt;
-    return VectorType::get(Elt, VF);
-  };
-
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
-  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
-  FastMathFlags FMF;
-  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
-    FMF = FPMO->getFastMathFlags();
-  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
-  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
-  SmallVector<Type *> ParamTys;
-  std::transform(FTy->param_begin(), FTy->param_end(), ParamTys.begin(),
-                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
-
-  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
-                                    dyn_cast<IntrinsicInst>(CI));
+  IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
   return TTI.getIntrinsicInstrCost(CostAttrs,
                                    TargetTransformInfo::TCK_RecipThroughput);
 }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d21151d9a909..0b630197911a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3417,16 +3417,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

   // Calculate the cost of the scalar and vector calls.
-  SmallVector<Type *, 4> VecTys;
-  for (Use &Arg : CI->args())
-    VecTys.push_back(
-        FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
-  FastMathFlags FMF;
-  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
-    FMF = FPCI->getFastMathFlags();
-  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
-  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
-                                    dyn_cast<IntrinsicInst>(CI));
+  IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
   auto IntrinsicCost =
       TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

@@ -3437,6 +3428,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   auto LibCost = IntrinsicCost;
   if (!CI->isNoBuiltin() && VecFunc) {
     // Calculate the cost of the vector library call.
+    SmallVector<Type *, 4> VecTys;
+    for (Use &Arg : CI->args())
+      VecTys.push_back(
+          FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+
     // If the corresponding vector call is cheaper, return its cost.
     LibCost =
         TTI->getCallInstrCost(nullptr, VecTy, VecTys, TTI::TCK_RecipThroughput);
@@ -3802,7 +3798,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

       // Calculate the cost of the scalar and vector calls.
-      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
+      IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
       InstructionCost ScalarEltCost =
           TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
       if (NeedToShuffleReuses) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 59c736885eba..b86a7da0daff 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"

 ; CHECK-COST-LABEL: sadd
 ; CHECK-COST: Found an estimated cost of 10 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Found an estimated cost of 26 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Found an estimated cost of 58 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Found an estimated cost of 122 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)

 define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @saddsat(
@@ -21,38 +21,29 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1
 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590
 ; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
 ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFFSET:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 8
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]])
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[NEXT_GEP6]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[NEXT_GEP6]], i64 8
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP10]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <2 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[WIDE_LOAD]], <2 x i16> [[BROADCAST_SPLAT]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[NEXT_GEP5]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> [[TMP4]], <2 x i16>* [[TMP5]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -66,10 +57,10 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
 ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]])
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP7]], i16 [[OFFSET]])
 ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT: store i16 [[TMP13]], i16* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT: store i16 [[TMP8]], i16* [[PDST_ADDR_07]], align 2
 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP2:!llvm.loop !.*]]
@@ -99,10 +90,10 @@ while.end: ; preds = %while.body, %entry

 ; CHECK-COST-LABEL: umin
 ; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 6 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 14 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 30 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 62 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)

 define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @umin(
@@ -116,87 +107,78 @@ define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 31
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 15
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 16
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]])
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[NEXT_GEP3]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i64 16
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[NEXT_GEP2]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT: [[CAST_CRD7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD7]]
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK: vec.epilog.ph:
 ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[BLOCKSIZE]], -1
-; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
-; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], 1
-; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP14]], 8589934584
-; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[BLOCKSIZE]], -1
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP8]], 1
+; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP9]], 8589934584
+; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC4]] to i32
 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
-; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC4]]
+; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC4]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]]
-; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2
-; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]])
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2
-; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX10]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX5]]
+; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX5]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[NEXT_GEP17]] to <8 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP10]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP18]] to <8 x i8>*
+; CHECK-NEXT: store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2
+; CHECK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX5]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP14]], [[N_VEC9]]
-; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP9]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8* [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]])
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP14]], i8 [[OFFSET]])
 ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT: store i8 [[TMP15]], i8* [[PDST_ADDR_07]], align 2
 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP6:!llvm.loop !.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index 5e213f18ebe5..fccdf9b1cc2a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -157,7 +157,7 @@ while.end: ; preds = %while.body, %entry
 ; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
 ; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
 ; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 1 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)

 define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @cttz(
diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
index ad8e35bf9478..042f03840613 100644
--- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
@@ -4,22 +4,20 @@
 ; Regression test for a bug in the SLP vectorizer that was causing
 ; these rotates to be incorrectly combined into a vector rotate.

+; The bug fix is at https://reviews.llvm.org/D85759. This test has
+; been pre-committed to demonstrate the regressed behavior and provide
+; a clear diff for the bug fix.
+
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"

 define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
-; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
-; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[B]] to i64
-; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
-; CHECK-NEXT: store i64 [[C]], i64* [[OUT:%.*]], align 8
-; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
-; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64
-; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
-; CHECK-NEXT: store i64 [[F]], i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[X:%.*]], <2 x i64> [[X]], <2 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[OUT:%.*]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8
 ; CHECK-NEXT: ret void
 ;
   %a = extractelement <2 x i64> %x, i32 0