[CostModel] Unify Intrinsic Costs.

Recommitting most of the remaining changes from
259eb619ff, but excluding the call to
getUserCost from getInstructionThroughput. Though there's still no
test changes, I doubt that this is an NFC...

With the two getIntrinsicInstrCosts folded into one, now fold in the
scalar/code-size orientated getIntrinsicCost. The remaining scalar
intrinsics were memcpy, cttz and ctlz which now have special handling
in the BasicTTI implementation.

This had required a change in the AMDGPU backend for fabs as it
should always be 'free'. I've also changed the X86 backend to return
the BaseT implementation when the CostKind isn't RecipThroughput.

Differential Revision: https://reviews.llvm.org/D80012
This commit is contained in:
Sam Parker 2020-05-26 09:23:18 +01:00
parent 64cfb8a864
commit 871556a494
7 changed files with 73 additions and 118 deletions

View File

@ -38,6 +38,7 @@ class AssumptionCache;
class BlockFrequencyInfo;
class DominatorTree;
class BranchInst;
class CallBase;
class Function;
class GlobalValue;
class IntrinsicInst;
@ -120,10 +121,12 @@ class IntrinsicCostAttributes {
public:
IntrinsicCostAttributes(const IntrinsicInst &I);
IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
unsigned Factor);
IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
unsigned Factor, unsigned ScalarCost);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
@ -141,7 +144,7 @@ public:
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Value *> Args);
Intrinsic::ID getID() const { return IID; }
@ -288,18 +291,6 @@ public:
/// scientific. A target may has no bonus on vector instructions.
int getInlinerVectorBonusPercent() const;
/// Estimate the cost of an intrinsic when lowered.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys,
const User *U = nullptr,
TTI::TargetCostKind CostKind = TCK_SizeAndLatency) const;
/// Estimate the cost of an intrinsic when lowered.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments,
const User *U = nullptr,
TTI::TargetCostKind CostKind = TCK_SizeAndLatency) const;
/// \return the expected cost of a memcpy, which could e.g. depend on the
/// source/destination type and alignment and the number of bytes copied.
int getMemcpyCost(const Instruction *I) const;
@ -1231,13 +1222,6 @@ public:
TTI::TargetCostKind CostKind) = 0;
virtual unsigned getInliningThresholdMultiplier() = 0;
virtual int getInlinerVectorBonusPercent() = 0;
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys, const User *U,
enum TargetCostKind CostKind) = 0;
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments,
const User *U,
enum TargetCostKind CostKind) = 0;
virtual int getMemcpyCost(const Instruction *I) = 0;
virtual unsigned
getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
@ -1495,18 +1479,6 @@ public:
int getInlinerVectorBonusPercent() override {
return Impl.getInlinerVectorBonusPercent();
}
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys,
const User *U = nullptr,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) override {
return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U, CostKind);
}
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments,
const User *U = nullptr,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) override {
return Impl.getIntrinsicCost(IID, RetTy, Arguments, U, CostKind);
}
int getMemcpyCost(const Instruction *I) override {
return Impl.getMemcpyCost(I);
}

View File

@ -772,36 +772,8 @@ public:
return TTI::TCC_Basic;
}
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys, const User *U,
TTI::TargetCostKind CostKind) {
switch (IID) {
default:
break;
// TODO: other libc intrinsics.
case Intrinsic::memcpy:
return static_cast<T *>(this)->getMemcpyCost(dyn_cast<Instruction>(U));
}
IntrinsicCostAttributes Attrs(IID, RetTy, ParamTys);
return getIntrinsicInstrCost(Attrs, CostKind);
}
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments, const User *U,
TTI::TargetCostKind CostKind) {
// Delegate to the generic intrinsic handling code. This mostly provides an
// opportunity for targets to (for example) special case the cost of
// certain intrinsics based on constants used as arguments.
SmallVector<Type *, 8> ParamTys;
ParamTys.reserve(Arguments.size());
for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx)
ParamTys.push_back(Arguments[Idx]->getType());
return static_cast<T *>(this)->getIntrinsicCost(IID, RetTy, ParamTys, U,
CostKind);
}
unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
TTI::TargetCostKind CostKind) {
int getUserCost(const User *U, ArrayRef<const Value *> Operands,
TTI::TargetCostKind CostKind) {
auto *TargetTTI = static_cast<T *>(this);
// FIXME: Unlikely to be true for anything but CodeSize.
@ -810,9 +782,8 @@ public:
if (F) {
FunctionType *FTy = F->getFunctionType();
if (Intrinsic::ID IID = F->getIntrinsicID()) {
SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end());
return TargetTTI->getIntrinsicCost(IID, FTy->getReturnType(),
ParamTys, U, CostKind);
IntrinsicCostAttributes Attrs(IID, *CB);
return TargetTTI->getIntrinsicInstrCost(Attrs, CostKind);
}
if (!TargetTTI->isLoweredToCall(F))

View File

@ -296,30 +296,6 @@ public:
return BaseT::getGEPCost(PointeeType, Ptr, Operands);
}
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments, const User *U,
TTI::TargetCostKind CostKind) {
return BaseT::getIntrinsicCost(IID, RetTy, Arguments, U, CostKind);
}
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys, const User *U,
TTI::TargetCostKind CostKind) {
if (IID == Intrinsic::cttz) {
if (getTLI()->isCheapToSpeculateCttz())
return TargetTransformInfo::TCC_Basic;
return TargetTransformInfo::TCC_Expensive;
}
if (IID == Intrinsic::ctlz) {
if (getTLI()->isCheapToSpeculateCtlz())
return TargetTransformInfo::TCC_Basic;
return TargetTransformInfo::TCC_Expensive;
}
return BaseT::getIntrinsicCost(IID, RetTy, ParamTys, U, CostKind);
}
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
unsigned &JumpTableSize,
ProfileSummaryInfo *PSI,
@ -1090,6 +1066,28 @@ public:
/// Get intrinsic cost based on arguments.
unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
Intrinsic::ID IID = ICA.getID();
auto *ConcreteTTI = static_cast<T *>(this);
// Special case some scalar intrinsics.
if (CostKind != TTI::TCK_RecipThroughput) {
switch (IID) {
default:
break;
case Intrinsic::cttz:
if (getTLI()->isCheapToSpeculateCttz())
return TargetTransformInfo::TCC_Basic;
break;
case Intrinsic::ctlz:
if (getTLI()->isCheapToSpeculateCtlz())
return TargetTransformInfo::TCC_Basic;
break;
case Intrinsic::memcpy:
return ConcreteTTI->getMemcpyCost(ICA.getInst());
// TODO: other libc intrinsics.
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
return 0;
@ -1098,17 +1096,14 @@ public:
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
Intrinsic::ID IID = ICA.getID();
const IntrinsicInst *I = ICA.getInst();
Type *RetTy = ICA.getReturnType();
const SmallVectorImpl<Value *> &Args = ICA.getArgs();
unsigned VF = ICA.getVectorFactor();
FastMathFlags FMF = ICA.getFlags();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getNumElements() : 1);
assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
auto *ConcreteTTI = static_cast<T *>(this);
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
switch (IID) {
default: {
@ -1595,13 +1590,14 @@ public:
CostKind) +
ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
CostKind);
if (IID == Intrinsic::experimental_constrained_fmuladd)
return ConcreteTTI->getIntrinsicCost(
Intrinsic::experimental_constrained_fmul, RetTy, Tys, nullptr,
CostKind) +
ConcreteTTI->getIntrinsicCost(
Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr,
CostKind);
if (IID == Intrinsic::experimental_constrained_fmuladd) {
IntrinsicCostAttributes FMulAttrs(
Intrinsic::experimental_constrained_fmul, RetTy, Tys);
IntrinsicCostAttributes FAddAttrs(
Intrinsic::experimental_constrained_fadd, RetTy, Tys);
return ConcreteTTI->getIntrinsicInstrCost(FMulAttrs, CostKind) +
ConcreteTTI->getIntrinsicInstrCost(FAddAttrs, CostKind);
}
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it

View File

@ -63,7 +63,20 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
FMF = FPMO->getFastMathFlags();
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI) :
II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id) {
if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
FunctionType *FTy =
CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
unsigned Factor) :
RetTy(CI.getType()), IID(Id), VF(Factor) {
@ -76,7 +89,8 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
unsigned Factor,
unsigned ScalarCost) :
RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
@ -236,15 +250,6 @@ int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
return TTIImpl->getGEPCost(PointeeType, Ptr, Operands, CostKind);
}
int TargetTransformInfo::getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments,
const User *U,
TTI::TargetCostKind CostKind) const {
int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments, U, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI) const {

View File

@ -560,6 +560,9 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
if (ICA.getID() == Intrinsic::fabs)
return 0;
if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
return BaseT::getIntrinsicInstrCost(ICA, CostKind);

View File

@ -2699,6 +2699,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
@ -3932,6 +3935,9 @@ int X86TTIImpl::getGatherScatterOpCost(
unsigned Alignment, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) {
if (CostKind != TTI::TCK_RecipThroughput)
return 1;
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());

View File

@ -1535,7 +1535,7 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
// %inc = add nsw %i.0, 1
// br i1 %tobool
const Value *Args[] =
Value *Args[] =
{InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
: ConstantInt::getFalse(InitX->getContext())};
@ -1544,9 +1544,11 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
uint32_t HeaderSize =
std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
int Cost =
TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
if (HeaderSize != IdiomCanonicalSize &&
TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
TargetTransformInfo::TCC_Basic)
Cost > TargetTransformInfo::TCC_Basic)
return false;
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,