Revert "[CostModel] Remove VF from IntrinsicCostAttributes"

This reverts commit 502a67dd7f.

This exposed a failure in the test-suite build on PowerPC;
revert to unblock the buildbot first.
Dave will re-commit in https://reviews.llvm.org/D96287.

Thanks Dave.
Jinsong Ji 2021-02-09 02:12:54 +00:00
parent 21e8bb8325
commit 9202806241
10 changed files with 236 additions and 191 deletions


@@ -118,32 +118,44 @@ class IntrinsicCostAttributes {
SmallVector<Type *, 4> ParamTys;
SmallVector<const Value *, 4> Arguments;
FastMathFlags FMF;
ElementCount VF = ElementCount::getFixed(1);
// If ScalarizationCost is UINT_MAX, the cost of scalarizing the
// arguments and the return value will be computed based on types.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
public:
IntrinsicCostAttributes(
Intrinsic::ID Id, const CallBase &CI,
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max());
IntrinsicCostAttributes(const IntrinsicInst &I);
IntrinsicCostAttributes(
Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
unsigned ScalarCost = std::numeric_limits<unsigned>::max());
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
ElementCount Factor);
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
ElementCount Factor, unsigned ScalarCost);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys, FastMathFlags Flags);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys, FastMathFlags Flags,
unsigned ScalarCost);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys, FastMathFlags Flags,
unsigned ScalarCost,
const IntrinsicInst *I);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<const Value *> Args);
IntrinsicCostAttributes(
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
const IntrinsicInst *I = nullptr,
unsigned ScalarCost = std::numeric_limits<unsigned>::max());
Intrinsic::ID getID() const { return IID; }
const IntrinsicInst *getInst() const { return II; }
Type *getReturnType() const { return RetTy; }
ElementCount getVectorFactor() const { return VF; }
FastMathFlags getFlags() const { return FMF; }
unsigned getScalarizationCost() const { return ScalarizationCost; }
const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }

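For context, here is a minimal usage sketch of the interface restored above (an editor's illustration, not code from the commit; it assumes the LLVM headers at this revision, and `ID`, `Call`, `VF`, `VecRetTy`, `ArgTys`, `FMF`, and `TTI` are placeholders for values a caller already has):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Vectorizer-style query: describe the scalar call plus an explicit
// vectorization factor; the target widens the types internally. Note that
// the restored constructor asserts the factor is not scalable.
static InstructionCost vectorizerStyleCost(Intrinsic::ID ID, const CallInst &Call,
                                           ElementCount VF,
                                           const TargetTransformInfo &TTI) {
  IntrinsicCostAttributes Attrs(ID, Call, VF);
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}

// Type-based query: the return and parameter types are already vector types,
// so the vector factor keeps its default of 1.
static InstructionCost typeBasedCost(Intrinsic::ID ID, Type *VecRetTy,
                                     ArrayRef<Type *> ArgTys, FastMathFlags FMF,
                                     const TargetTransformInfo &TTI) {
  IntrinsicCostAttributes Attrs(ID, VecRetTy, ArgTys, FMF);
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}

Carrying the VF in the attributes lets a vectorizer ask for the cost of a widened intrinsic before it has materialized any vector types, which is the calling convention the loop and SLP vectorizer hunks below rely on.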

@@ -1211,9 +1211,12 @@ public:
Type *RetTy = ICA.getReturnType();
ElementCount VF = ICA.getVectorFactor();
ElementCount RetVF =
(RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
: ElementCount::getFixed(1));
assert((RetVF.isScalar() || VF.isScalar()) &&
"VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
@@ -1223,13 +1226,15 @@ public:
case Intrinsic::cttz:
// FIXME: If necessary, this should go in target-specific overrides.
if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
if (VF.isScalar() && RetVF.isScalar() &&
getTLI()->isCheapToSpeculateCttz())
return TargetTransformInfo::TCC_Basic;
break;
case Intrinsic::ctlz:
// FIXME: If necessary, this should go in target-specific overrides.
if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
if (VF.isScalar() && RetVF.isScalar() &&
getTLI()->isCheapToSpeculateCtlz())
return TargetTransformInfo::TCC_Basic;
break;
@@ -1237,14 +1242,16 @@ public:
return thisT()->getMemcpyCost(ICA.getInst());
case Intrinsic::masked_scatter: {
assert(VF.isScalar() && "Can't vectorize types here.");
const Value *Mask = Args[3];
bool VarMask = !isa<Constant>(Mask);
Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
return thisT()->getGatherScatterOpCost(Instruction::Store,
ICA.getArgTypes()[0], Args[1],
Args[0]->getType(), Args[1],
VarMask, Alignment, CostKind, I);
}
case Intrinsic::masked_gather: {
assert(VF.isScalar() && "Can't vectorize types here.");
const Value *Mask = Args[2];
bool VarMask = !isa<Constant>(Mask);
Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
@@ -1282,13 +1289,13 @@ public:
case Intrinsic::vector_reduce_fmin:
case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin: {
IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
case Intrinsic::vector_reduce_fadd:
case Intrinsic::vector_reduce_fmul: {
IntrinsicCostAttributes Attrs(
IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
case Intrinsic::fshl:
@@ -1340,20 +1347,32 @@ public:
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
// Assume that we need to scalarize this intrinsic.
SmallVector<Type *, 4> Types;
for (const Value *Op : Args) {
Type *OpTy = Op->getType();
assert(VF.isScalar() || !OpTy->isVectorTy());
Types.push_back(VF.isScalar()
? OpTy
: FixedVectorType::get(OpTy, VF.getKnownMinValue()));
}
if (VF.isVector() && !RetTy->isVoidTy())
RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
// Compute the scalarization overhead based on Args for a vector
// intrinsic.
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
if (RetVF.isVector()) {
if (RetVF.isVector() || VF.isVector()) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost +=
getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
}
IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
ScalarizationCost);
IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
@@ -1596,7 +1615,7 @@ public:
// SatMin -> Overflow && SumDiff >= 0
unsigned Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);
ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
@@ -1617,7 +1636,7 @@ public:
unsigned Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);
ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,

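As a small illustration of the widening that the scalarization hunk above reinstates (an editor's sketch, not code from the commit): when a caller supplies a scalar return type together with VF > 1, the generic implementation rebuilds the corresponding fixed vector types before computing the scalarization overhead and recursing into the type-based cost.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Vectorizer-style inputs: a scalar i16 return type and a fixed VF of 8.
  Type *ScalarRetTy = Type::getInt16Ty(Ctx);
  ElementCount VF = ElementCount::getFixed(8);
  // The hunk above widens the return type (and each operand type) like this,
  // producing <8 x i16> for the type-based cost query.
  Type *WideRetTy = FixedVectorType::get(ScalarRetTy, VF.getKnownMinValue());
  (void)WideRetTy;
  return 0;
}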

@@ -54,26 +54,86 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
return true;
}
IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
II(&I), RetTy(I.getType()), IID(I.getIntrinsicID()) {
FunctionType *FTy = I.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
Arguments.insert(Arguments.begin(), I.arg_begin(), I.arg_end());
if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
FMF = FPMO->getFastMathFlags();
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
unsigned ScalarizationCost)
: II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
ScalarizationCost(ScalarizationCost) {
const CallBase &CI) :
II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id) {
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
FunctionType *FTy =
CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
ElementCount Factor)
: RetTy(CI.getType()), IID(Id), VF(Factor) {
assert(!Factor.isScalable() && "Scalable vectors are not yet supported");
if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
FunctionType *FTy =
CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
ElementCount Factor,
unsigned ScalarCost)
: RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
FunctionType *FTy =
CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys,
FastMathFlags Flags) :
RetTy(RTy), IID(Id), FMF(Flags) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
const IntrinsicInst *I,
unsigned ScalarCost)
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
unsigned ScalarCost) :
RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
unsigned ScalarCost,
const IntrinsicInst *I) :
II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys) :
RetTy(RTy), IID(Id) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
@@ -87,17 +147,6 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
ParamTys.push_back(Arguments[Idx]->getType());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
const IntrinsicInst *I,
unsigned ScalarCost)
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
}
bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
LoopInfo &LI, DominatorTree &DT,
bool ForceNestedLoop,


@@ -731,28 +731,40 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
Type *RetTy = ICA.getReturnType();
unsigned VF = ICA.getVectorFactor().getFixedValue();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
: 1);
assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
// Assume that we need to scalarize this intrinsic.
SmallVector<Type *, 4> Types;
for (const Value *Op : Args) {
Type *OpTy = Op->getType();
assert(VF == 1 || !OpTy->isVectorTy());
Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
}
if (VF > 1 && !RetTy->isVoidTy())
RetTy = FixedVectorType::get(RetTy, VF);
// Compute the scalarization overhead based on Args for a vector
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
if (RetVF > 1) {
if (RetVF > 1 || VF > 1) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
}
IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
ScalarizationCost);
IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
ScalarizationCost, I);
return getIntrinsicInstrCost(Attrs, CostKind);
}
@@ -772,20 +784,9 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// TODO: Get more refined intrinsic costs?
unsigned InstRate = getQuarterRateInstrCost(CostKind);
switch (ICA.getID()) {
case Intrinsic::fma:
if (ICA.getID() == Intrinsic::fma) {
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
break;
case Intrinsic::uadd_sat:
case Intrinsic::usub_sat:
case Intrinsic::sadd_sat:
case Intrinsic::ssub_sat:
static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
NElts = 1;
break;
}
return LT.first * NElts * InstRate;


@@ -1550,16 +1550,21 @@ int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
case Intrinsic::usub_sat: {
if (!ST->hasMVEIntegerOps())
break;
// Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
Type *VT = ICA.getReturnType();
if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
VT = VectorType::get(VT, ICA.getVectorFactor());
std::pair<int, MVT> LT =
TLI->getTypeLegalizationCost(DL, VT);
if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
LT.second == MVT::v16i8) {
// This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
// This is a base cost of 1 for the vadd, plus 3 extract shifts if we
// need to extend the type, as it uses shr(qadd(shl, shl)).
unsigned Instrs =
LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
unsigned Instrs = LT.second.getScalarSizeInBits() ==
ICA.getReturnType()->getScalarSizeInBits()
? 1
: 4;
return LT.first * ST->getMVEVectorCostFactor() * Instrs;
}
break;


@@ -3803,27 +3803,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
return Elt;
return VectorType::get(Elt, VF);
};
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
SmallVector<Type *> ParamTys;
std::transform(FTy->param_begin(), FTy->param_end(), ParamTys.begin(),
[&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
dyn_cast<IntrinsicInst>(CI));
IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
return TTI.getIntrinsicInstrCost(CostAttrs,
TargetTransformInfo::TCK_RecipThroughput);
}


@@ -3417,16 +3417,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
SmallVector<Type *, 4> VecTys;
for (Use &Arg : CI->args())
VecTys.push_back(
FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
FastMathFlags FMF;
if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
FMF = FPCI->getFastMathFlags();
SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
dyn_cast<IntrinsicInst>(CI));
IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
@@ -3437,6 +3428,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
auto LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
SmallVector<Type *, 4> VecTys;
for (Use &Arg : CI->args())
VecTys.push_back(
FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
TTI::TCK_RecipThroughput);
@@ -3802,7 +3798,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {

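To illustrate how the SLP vectorizer hunks above use the two restored overloads (a hypothetical helper, not code from the commit; `ID`, `CI`, `VecTy`, and `TTI` are placeholders, under the same assumptions as the earlier sketch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Rough profitability check in the spirit of getVectorCallCosts/getEntryCost:
// compare one wide intrinsic call against VF copies of the scalar call.
static bool vectorCallIsCheaper(Intrinsic::ID ID, const CallInst &CI,
                                FixedVectorType *VecTy,
                                const TargetTransformInfo &TTI) {
  // Per-element scalar cost: VF = 1 with the scalarization cost pinned to 1.
  IntrinsicCostAttributes ScalarAttrs(ID, CI, ElementCount::getFixed(1), 1);
  InstructionCost ScalarCost =
      TTI.getIntrinsicInstrCost(ScalarAttrs, TargetTransformInfo::TCK_RecipThroughput);
  // Cost of a single wide call covering all lanes of VecTy.
  IntrinsicCostAttributes VecAttrs(ID, CI, VecTy->getElementCount());
  InstructionCost VecCost =
      TTI.getIntrinsicInstrCost(VecAttrs, TargetTransformInfo::TCK_RecipThroughput);
  return VecCost < ScalarCost * VecTy->getNumElements();
}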

@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
; CHECK-COST-LABEL: sadd
; CHECK-COST: Found an estimated cost of 10 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 26 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 58 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 122 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @saddsat(
@@ -21,38 +21,29 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFFSET:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 8
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[NEXT_GEP6]] to <8 x i16>*
; CHECK-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* [[TMP8]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[NEXT_GEP6]], i64 8
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <2 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[WIDE_LOAD]], <2 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[NEXT_GEP5]] to <2 x i16>*
; CHECK-NEXT: store <2 x i16> [[TMP4]], <2 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -66,10 +57,10 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]])
; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP7]], i16 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1
; CHECK-NEXT: store i16 [[TMP13]], i16* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: store i16 [[TMP8]], i16* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP2:!llvm.loop !.*]]
@@ -99,10 +90,10 @@ while.end: ; preds = %while.body, %entry
; CHECK-COST-LABEL: umin
; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 6 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 14 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 30 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 62 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @umin(
@@ -116,87 +107,78 @@ define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 31
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 15
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 16
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[NEXT_GEP3]] to <16 x i8>*
; CHECK-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* [[TMP8]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i64 16
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[NEXT_GEP2]] to <16 x i8>*
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
; CHECK-NEXT: [[CAST_CRD7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD7]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], 1
; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP14]], 8589934584
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP8]], 1
; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP9]], 8589934584
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC4]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC4]]
; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC4]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]]
; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]]
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2
; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]])
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>*
; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2
; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX10]], 8
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]]
; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX5]]
; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX5]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[NEXT_GEP17]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP10]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP18]] to <8 x i8>*
; CHECK-NEXT: store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2
; CHECK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX5]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP14]], [[N_VEC9]]
; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP9]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8* [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]])
; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP14]], i8 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: store i8 [[TMP15]], i8* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP6:!llvm.loop !.*]]


@@ -157,7 +157,7 @@ while.end: ; preds = %while.body, %entry
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 4 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @cttz(


@@ -4,22 +4,20 @@
; Regression test for a bug in the SLP vectorizer that was causing
; these rotates to be incorrectly combined into a vector rotate.
; The bug fix is at https://reviews.llvm.org/D85759. This test has
; been pre-committed to demonstrate the regressed behavior and provide
; a clear diff for the bug fix.
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[B]] to i64
; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
; CHECK-NEXT: store i64 [[C]], i64* [[OUT:%.*]], align 8
; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64
; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
; CHECK-NEXT: store i64 [[F]], i64* [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[X:%.*]], <2 x i64> [[X]], <2 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[OUT:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8
; CHECK-NEXT: ret void
;
%a = extractelement <2 x i64> %x, i32 0