From 4bdf1aa416b023d65fe2eb3ba5740f928be1842b Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 13 Apr 2018 20:46:50 +0000 Subject: [PATCH] [Hexagon] Initial instruction cost model for auto-vectorization llvm-svn: 330065 --- .../Hexagon/HexagonTargetTransformInfo.cpp | 175 ++++++++++++++++-- .../Hexagon/HexagonTargetTransformInfo.h | 118 ++++-------- 2 files changed, 195 insertions(+), 98 deletions(-) diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 764e8b8b7889..ded0adb97a34 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -16,6 +16,7 @@ #include "HexagonTargetTransformInfo.h" #include "HexagonSubtarget.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/User.h" @@ -27,16 +28,35 @@ using namespace llvm; #define DEBUG_TYPE "hexagontti" -static cl::opt HexagonAutoHVX("hexagon-autohvx", cl::init(false), +static cl::opt HexagonAutoHVX("hexagon-autohvx", cl::init(true), cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); static cl::opt EmitLookupTables("hexagon-emit-lookup-tables", cl::init(true), cl::Hidden, cl::desc("Control lookup table emission on Hexagon target")); + +bool HexagonTTIImpl::useHVX() const { + return ST.useHVXOps() && HexagonAutoHVX; +} + +bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { + assert(VecTy->isVectorTy()); + // Avoid types like <2 x i32*>. + if (!cast(VecTy)->getElementType()->isIntegerTy()) + return false; + EVT VecVT = EVT::getEVT(VecTy); + if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64) + return false; + if (ST.isHVXVectorType(VecVT.getSimpleVT())) + return true; + auto Action = TLI.getPreferredVectorAction(VecVT); + return Action == TargetLoweringBase::TypeWidenVector; +} + TargetTransformInfo::PopcntSupportKind HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { - // Return Fast Hardware support as every input < 64 bits will be promoted + // Return fast hardware support as every input < 64 bits will be promoted // to 64 bits. return TargetTransformInfo::PSK_FastHardware; } @@ -58,14 +78,16 @@ bool HexagonTTIImpl::shouldFavorPostInc() const { return true; } +/// --- Vector TTI begin --- + unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const { if (Vector) - return HexagonAutoHVX && getST()->useHVXOps() ? 32 : 0; + return useHVX() ? 32 : 0; return 32; } unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) { - return HexagonAutoHVX && getST()->useHVXOps() ? 64 : 0; + return useHVX() ? 2 : 0; } unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const { @@ -73,38 +95,161 @@ unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const { } unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { - return getST()->useHVXOps() ? getST()->getVectorLength()*8 : 0; + return useHVX() ? ST.getVectorLength()*8 : 0; } unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const { - return (8 * getST()->getVectorLength()) / ElemWidth; + return (8 * ST.getVectorLength()) / ElemWidth; +} + +unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, + bool Extract) { + return BaseT::getScalarizationOverhead(Ty, Insert, Extract); +} + +unsigned HexagonTTIImpl::getOperandsScalarizationOverhead( + ArrayRef Args, unsigned VF) { + return BaseT::getOperandsScalarizationOverhead(Args, VF); +} + +unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, + ArrayRef Tys) { + return BaseT::getCallInstrCost(F, RetTy, Tys); +} + +unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args, FastMathFlags FMF, unsigned VF) { + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); +} + +unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed) { + if (ID == Intrinsic::bswap) { + std::pair LT = TLI.getTypeLegalizationCost(DL, RetTy); + return LT.first + 2; + } + return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, + ScalarizationCostPassed); +} + +unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp, + ScalarEvolution *SE, const SCEV *S) { + return 0; } unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { - if (Opcode == Instruction::Load && Src->isVectorTy()) { + assert(Opcode == Instruction::Load || Opcode == Instruction::Store); + if (Opcode == Instruction::Store) + return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I); + + if (Src->isVectorTy()) { VectorType *VecTy = cast(Src); unsigned VecWidth = VecTy->getBitWidth(); - if (VecWidth > 64) { - // Assume that vectors longer than 64 bits are meant for HVX. - if (getNumberOfRegisters(true) > 0) { - if (VecWidth % getRegisterBitWidth(true) == 0) - return 1; - } + if (useHVX() && isTypeForHVX(VecTy)) { + unsigned RegWidth = getRegisterBitWidth(true); + Alignment = std::min(Alignment, RegWidth/8); + // Cost of HVX loads. + if (VecWidth % RegWidth == 0) + return VecWidth / RegWidth; + // Cost of constructing HVX vector from scalar loads. unsigned AlignWidth = 8 * std::max(1u, Alignment); unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; return 3*NumLoads; } + + // Non-HVX vectors. + // Add extra cost for floating point types. + unsigned Cost = VecTy->getElementType()->isFloatingPointTy() ? 4 : 1; + + Alignment = std::min(Alignment, 8u); + unsigned AlignWidth = 8 * std::max(1u, Alignment); + unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; + if (Alignment == 4 || Alignment == 8) + return Cost * NumLoads; + // Loads of less than 32 bits will need extra inserts to compose a vector. + unsigned LogA = Log2_32(Alignment); + return (3 - LogA) * Cost * NumLoads; } + return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I); } +unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, + Type *Src, unsigned Alignment, unsigned AddressSpace) { + return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); +} + +unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, + int Index, Type *SubTp) { + return 1; +} + +unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, unsigned Alignment) { + return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment); +} + +unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace) { + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} + +unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy, const Instruction *I) { + if (ValTy->isVectorTy()) { + auto *VecTy = dyn_cast(ValTy); + std::pair LT = TLI.getTypeLegalizationCost(DL, ValTy); + if (Opcode == Instruction::FCmp) + return LT.first + 4 * VecTy->getNumElements(); + } + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); +} + +unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { + return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo, Args); +} + +unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src, const Instruction *I) { + return 1; +} + +unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { + Type *ElemTy = Val->isVectorTy() ? cast(Val)->getElementType() + : Val; + if (Opcode == Instruction::InsertElement) { + // Need two rotations for non-zero index. + unsigned Cost = (Index != 0) ? 2 : 0; + if (ElemTy->isIntegerTy(32)) + return Cost; + // If it's not a 32-bit value, there will need to be an extract. + return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index); + } + + if (Opcode == Instruction::ExtractElement) + return 2; + + return 1; +} + +/// --- Vector TTI end --- + unsigned HexagonTTIImpl::getPrefetchDistance() const { - return getST()->getL1PrefetchDistance(); + return ST.getL1PrefetchDistance(); } unsigned HexagonTTIImpl::getCacheLineSize() const { - return getST()->getL1CacheLineSize(); + return ST.getL1CacheLineSize(); } int HexagonTTIImpl::getUserCost(const User *U, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 61ffdf64f9f6..99de5c017e28 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -37,16 +37,19 @@ class HexagonTTIImpl : public BasicTTIImplBase { friend BaseT; - const HexagonSubtarget *ST; - const HexagonTargetLowering *TLI; + const HexagonSubtarget &ST; + const HexagonTargetLowering &TLI; - const HexagonSubtarget *getST() const { return ST; } - const HexagonTargetLowering *getTLI() const { return TLI; } + const HexagonSubtarget *getST() const { return &ST; } + const HexagonTargetLowering *getTLI() const { return &TLI; } + + bool useHVX() const; + bool isTypeForHVX(Type *VecTy) const; public: explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F) - : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), - TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), + ST(*TM->getSubtargetImpl(F)), TLI(*ST.getTargetLowering()) {} /// \name Scalar TTI Implementations /// @{ @@ -73,110 +76,59 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; - bool shouldMaximizeVectorBandwidth(bool OptSize) const { return true; } unsigned getMinimumVF(unsigned ElemWidth) const; + bool shouldMaximizeVectorBandwidth(bool OptSize) const { + return true; + } bool supportsEfficientVectorElementLoadStore() { return false; } - - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { - return 0; - } - - unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF) { - return 0; - } - - unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) { - return 1; - } - - unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef Args, FastMathFlags FMF, unsigned VF) { - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); - } - unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX) { - return 1; - } - bool hasBranchDivergence() { return false; } - bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } - - unsigned getCFInstrCost(unsigned Opcode) { - return 1; - } - - unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *, - const SCEV *) { - return 0; + bool prefersVectorizedAddressing() { + return false; } + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + unsigned getOperandsScalarizationOverhead(ArrayRef Args, + unsigned VF); + unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys); + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args, FastMathFlags FMF, unsigned VF); + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX); + unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE, + const SCEV *S); unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I = nullptr); - unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) { - return 1; - } - + unsigned AddressSpace); unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { - return 1; - } - + Type *SubTp); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, - bool VariableMask, - unsigned Alignment) { - return 1; - } - + bool VariableMask, unsigned Alignment); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace) { - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); - } - - unsigned getNumberOfParts(Type *Tp) { - return BaseT::getNumberOfParts(Tp); - } - - bool prefersVectorizedAddressing() { - return true; - } - + unsigned Factor, ArrayRef Indices, unsigned Alignment, + unsigned AddressSpace); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - const Instruction *I) { - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); - } - + const Instruction *I); unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()) { - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args); - } - + ArrayRef Args = ArrayRef()); unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - const Instruction *I = nullptr) { - return 1; - } + const Instruction *I = nullptr); + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + unsigned getCFInstrCost(unsigned Opcode) { return 1; }