From 7ed3d81333b7a366d48a27521bb36ac58dc12fa2 Mon Sep 17 00:00:00 2001 From: Daniil Fukalov <1671137+dfukalov@users.noreply.github.com> Date: Thu, 18 Aug 2022 00:38:34 +0300 Subject: [PATCH] [NFCI] Move cost estimation from TargetLowering to TargetTransformInfo. TargetLowering had two last InstructionCost related `getTypeLegalizationCost()` and `getScalingFactorCost()` members, but all other costs are processed in TTI. E.g. it is not comfortable to use other TTI members in these two functions overridden in a target. Minor refactoring: `getTypeLegalizationCost()` now doesn't need DataLayout parameter - it was always passed from TTI. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D117723 --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 78 ++++++++++------ llvm/include/llvm/CodeGen/TargetLowering.h | 39 ++++---- llvm/lib/CodeGen/TargetLoweringBase.cpp | 35 ------- .../Target/AArch64/AArch64ISelLowering.cpp | 16 ---- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 8 -- .../AArch64/AArch64TargetTransformInfo.cpp | 68 +++++++++----- .../AArch64/AArch64TargetTransformInfo.h | 9 ++ .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 22 ++++- .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16 ---- llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 - llvm/lib/Target/ARM/ARMISelLowering.cpp | 12 --- llvm/lib/Target/ARM/ARMISelLowering.h | 8 -- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 61 ++++++++----- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 8 ++ .../Hexagon/HexagonTargetTransformInfo.cpp | 12 +-- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2 +- .../Target/PowerPC/PPCTargetTransformInfo.cpp | 16 ++-- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 12 +-- llvm/lib/Target/X86/X86ISelLowering.cpp | 29 ------ llvm/lib/Target/X86/X86ISelLowering.h | 9 -- .../lib/Target/X86/X86TargetTransformInfo.cpp | 91 ++++++++++++------- llvm/lib/Target/X86/X86TargetTransformInfo.h | 9 ++ 23 files changed, 273 insertions(+), 292 
deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 4bc3e6f68650..2be86d040727 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -368,7 +368,9 @@ public: AM.BaseOffs = BaseOffset; AM.HasBaseReg = HasBaseReg; AM.Scale = Scale; - return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) + return 0; + return -1; } bool isTruncateFree(Type *Ty1, Type *Ty2) { @@ -784,6 +786,41 @@ public: return Cost; } + /// Estimate the cost of type-legalization and the legalized type. + std::pair getTypeLegalizationCost(Type *Ty) const { + LLVMContext &C = Ty->getContext(); + EVT MTy = getTLI()->getValueType(DL, Ty); + + InstructionCost Cost = 1; + // We keep legalizing the type until we find a legal kind. We assume that + // the only operation that costs anything is the split. After splitting + // we need to handle two types. + while (true) { + TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy); + + if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) { + // Ensure we return a sensible simple VT here, since many callers of + // this function require it. + MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64; + return std::make_pair(InstructionCost::getInvalid(), VT); + } + + if (LK.first == TargetLoweringBase::TypeLegal) + return std::make_pair(Cost, MTy.getSimpleVT()); + + if (LK.first == TargetLoweringBase::TypeSplitVector || + LK.first == TargetLoweringBase::TypeExpandInteger) + Cost *= 2; + + // Do not loop with f128 type. + if (MTy == LK.second) + return std::make_pair(Cost, MTy.getSimpleVT()); + + // Keep legalizing the type. 
+ MTy = LK.second; + } + } + unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } InstructionCost getArithmeticInstrCost( @@ -806,7 +843,7 @@ public: Opd1PropInfo, Opd2PropInfo, Args, CxtI); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); bool IsFloat = Ty->isFPOrFPVectorTy(); // Assume that floating point arithmetic operations cost twice as much as @@ -940,10 +977,8 @@ public: const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair SrcLT = - TLI->getTypeLegalizationCost(DL, Src); - std::pair DstLT = - TLI->getTypeLegalizationCost(DL, Dst); + std::pair SrcLT = getTypeLegalizationCost(Src); + std::pair DstLT = getTypeLegalizationCost(Dst); TypeSize SrcSize = SrcLT.second.getSizeInBits(); TypeSize DstSize = DstLT.second.getSizeInBits(); @@ -1038,7 +1073,7 @@ public: // If we are legalizing by splitting, query the concrete TTI for the cost // of casting the original vector twice. We also need to factor in the // cost of the split itself. Count that as 1, to be consistent with - // TLI->getTypeLegalizationCost(). + // getTypeLegalizationCost(). 
bool SplitSrc = TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == TargetLowering::TypeSplitVector; @@ -1119,8 +1154,7 @@ public: if (CondTy->isVectorTy()) ISD = ISD::VSELECT; } - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); if (!(ValTy->isVectorTy() && !LT.second.isVector()) && !TLI->isOperationExpand(ISD, LT.second)) { @@ -1153,10 +1187,7 @@ public: InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { - std::pair LT = - getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); - - return LT.first; + return getRegUsageForType(Val->getScalarType()); } InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, @@ -1205,8 +1236,7 @@ public: // Assume types, such as structs, are expensive. if (getTLI()->getValueType(DL, Src, true) == MVT::Other) return 4; - std::pair LT = - getTLI()->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); // Assuming that all loads of legal types cost 1. InstructionCost Cost = LT.first; @@ -1286,7 +1316,7 @@ public: // Legalize the vector type, and get the legalized and unlegalized type // sizes. - MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + MVT VecTyLT = getTypeLegalizationCost(VecTy).second; unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); unsigned VecTyLTSize = VecTyLT.getStoreSize(); @@ -1583,9 +1613,7 @@ public: // If we're not expanding the intrinsic then we assume this is cheap // to implement. 
if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { - std::pair LT = - getTLI()->getTypeLegalizationCost(DL, RetTy); - return LT.first; + return getTypeLegalizationCost(RetTy).first; } // Create the expanded types that will be used to calculate the uadd_sat @@ -2031,8 +2059,7 @@ public: } const TargetLoweringBase *TLI = getTLI(); - std::pair LT = - TLI->getTypeLegalizationCost(DL, RetTy); + std::pair LT = getTypeLegalizationCost(RetTy); if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && @@ -2128,8 +2155,7 @@ public: } unsigned getNumberOfParts(Type *Tp) { - std::pair LT = - getTLI()->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); return LT.first.isValid() ? *LT.first.getValue() : 0; } @@ -2187,8 +2213,7 @@ public: unsigned NumReduxLevels = Log2_32(NumVecElts); InstructionCost ArithCost = 0; InstructionCost ShuffleCost = 0; - std::pair LT = - thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); + std::pair LT = thisT()->getTypeLegalizationCost(Ty); unsigned LongVectorCount = 0; unsigned MVTLen = LT.second.isVector() ? LT.second.getVectorNumElements() : 1; @@ -2283,8 +2308,7 @@ public: } InstructionCost MinMaxCost = 0; InstructionCost ShuffleCost = 0; - std::pair LT = - thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); + std::pair LT = thisT()->getTypeLegalizationCost(Ty); unsigned LongVectorCount = 0; unsigned MVTLen = LT.second.isVector() ? 
LT.second.getVectorNumElements() : 1; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 251fda1bae86..94fafcc11aaf 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -49,7 +49,6 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MachineValueType.h" #include #include @@ -964,6 +963,22 @@ public: return ValueTypeActions; } + /// Return pair that represents the legalization kind (first) that needs to + /// happen to EVT (second) in order to type-legalize it. + /// + /// First: how we should legalize values of this type, either it is already + /// legal (return 'Legal') or we need to promote it to a larger type (return + /// 'Promote'), or we need to expand it into multiple registers of smaller + /// integer type (return 'Expand'). 'Custom' is not an option. + /// + /// Second: for types supported by the target, this is an identity function. + /// For types that must be promoted to larger types, this returns the larger + /// type to promote to. For integer types that are larger than the largest + /// integer register, this contains one step in the expansion to get to the + /// smaller register. For illegal floating point types, this returns the + /// integer type to transform to. + LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; + /// Return how we should legalize values of this type, either it is already /// legal (return 'Legal') or we need to promote it to a larger type (return /// 'Promote'), or we need to expand it into multiple registers of smaller @@ -1905,10 +1920,6 @@ public: /// Get the ISD node that corresponds to the Instruction class opcode. int InstructionOpcodeToISD(unsigned Opcode) const; - /// Estimate the cost of type-legalization and the legalized type. 
- std::pair getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const; - /// @} //===--------------------------------------------------------------------===// @@ -2535,22 +2546,6 @@ public: Type *Ty, unsigned AddrSpace, Instruction *I = nullptr) const; - /// Return the cost of the scaling factor used in the addressing mode - /// represented by AM for this target, for a load/store of the specified type. - /// - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - /// TODO: Handle pre/postinc as well. - /// TODO: Remove default argument - virtual InstructionCost getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, Type *Ty, - unsigned AS = 0) const { - // Default: assume that any scaling factor used in a legal AM is free. - if (isLegalAddressingMode(DL, AM, Ty, AS)) - return 0; - return -1; - } - /// Return true if the specified immediate is legal icmp immediate, that is /// the target has icmp instructions which can compare a register against the /// immediate without having to materialize the immediate into a register. @@ -3257,8 +3252,6 @@ private: ValueTypeActionImpl ValueTypeActions; private: - LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; - /// Targets can specify ISD nodes that they would like PerformDAGCombine /// callbacks for by calling setTargetDAGCombine(), which sets a bit in this /// array. 
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 75ed8ba29a9a..4cf81cd7fd54 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1843,41 +1843,6 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { llvm_unreachable("Unknown instruction type encountered!"); } -std::pair -TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const { - LLVMContext &C = Ty->getContext(); - EVT MTy = getValueType(DL, Ty); - - InstructionCost Cost = 1; - // We keep legalizing the type until we find a legal kind. We assume that - // the only operation that costs anything is the split. After splitting - // we need to handle two types. - while (true) { - LegalizeKind LK = getTypeConversion(C, MTy); - - if (LK.first == TypeScalarizeScalableVector) { - // Ensure we return a sensible simple VT here, since many callers of this - // function require it. - MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64; - return std::make_pair(InstructionCost::getInvalid(), VT); - } - - if (LK.first == TypeLegal) - return std::make_pair(Cost, MTy.getSimpleVT()); - - if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger) - Cost *= 2; - - // Do not loop with f128 type. - if (MTy == LK.second) - return std::make_pair(Cost, MTy.getSimpleVT()); - - // Keep legalizing the type. 
- MTy = LK.second; - } -} - Value * TargetLoweringBase::getDefaultSafeStackPointerLocation(IRBuilderBase &IRB, bool UseTLS) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8b5b5b2b8e7f..dda52248b6d7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13680,22 +13680,6 @@ bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { return true; } -InstructionCost AArch64TargetLowering::getScalingFactorCost( - const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { - // Scaling factors are not free at all. - // Operands | Rt Latency - // ------------------------------------------- - // Rt, [Xn, Xm] | 4 - // ------------------------------------------- - // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 - // Rt, [Xn, Wm, #imm] | - if (isLegalAddressingMode(DL, AM, Ty, AS)) - // Scale represents reg2 * scale, thus account for 1 if - // it is not equal to 0 or 1. - return AM.Scale != 0 && AM.Scale != 1; - return -1; -} - bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 29765a5cab21..aabb9abe5fdf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -634,14 +634,6 @@ public: unsigned AS, Instruction *I = nullptr) const override; - /// Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. 
- InstructionCost getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, - Type *Ty, unsigned AS) const override; - /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this method /// returns true, otherwise fmuladd is expanded to fmul + fadd. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e55a7593fc8a..0b80750fa5c5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -309,7 +309,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, case Intrinsic::smax: { static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32}; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); // v2i64 types get converted to cmp+bif hence the cost of 2 if (LT.second == MVT::v2i64) return LT.first * 2; @@ -324,7 +324,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); // This is a base cost of 1 for the vadd, plus 3 extract shifts if we // need to extend the type, as it uses shr(qadd(shl, shl)). 
unsigned Instrs = @@ -337,14 +337,14 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); if (any_of(ValidAbsTys, [<](MVT M) { return M == LT.second; })) return LT.first; break; } case Intrinsic::experimental_stepvector: { InstructionCost Cost = 1; // Cost of the `index' instruction - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); // Legalisation of illegal vectors involves an `index' instruction plus // (LT.first - 1) vector adds. if (LT.first > 1) { @@ -368,7 +368,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, {Intrinsic::bitreverse, MVT::v1i64, 2}, {Intrinsic::bitreverse, MVT::v2i64, 2}, }; - const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); + const auto LegalisationCost = getTypeLegalizationCost(RetTy); const auto *Entry = CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); if (Entry) { @@ -394,7 +394,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, {ISD::CTPOP, MVT::v8i8, 1}, {ISD::CTPOP, MVT::i32, 5}, }; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); MVT MTy = LT.second; if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { // Extra cost of +1 when illegal vector types are legalized by promoting @@ -451,7 +451,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ICA.getArgTypes().empty()) break; bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; - auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); + auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); EVT MTy = TLI->getValueType(DL, RetTy); // Check for the legal types, which are where the size of the input 
and the // output are the same, or we are using cvt f64->i32 or f32->i64. @@ -1534,7 +1534,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // Legalize the destination type and ensure it can be used in a widening // operation. - auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); + auto DstTyL = getTypeLegalizationCost(DstTy); unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) return false; @@ -1542,7 +1542,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // Legalize the source type and ensure it can be used in a widening // operation. auto *SrcTy = toVectorTy(Extend->getSrcTy()); - auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); + auto SrcTyL = getTypeLegalizationCost(SrcTy); unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) return false; @@ -1899,7 +1899,7 @@ InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); // Legalize the types. - auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); + auto VecLT = getTypeLegalizationCost(VecTy); auto DstVT = TLI->getValueType(DL, Dst); auto SrcVT = TLI->getValueType(DL, Src); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -1954,7 +1954,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (Index != -1U) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair LT = getTypeLegalizationCost(Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -1989,7 +1989,7 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( Opd2PropInfo, Args, CxtI); // Legalize the type. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { @@ -2150,7 +2150,7 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; - auto LT = TLI->getTypeLegalizationCost(DL, ValTy); + auto LT = getTypeLegalizationCost(ValTy); if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; }) || (ST->hasFullFP16() && any_of(ValidFP16MinMaxTys, [<](MVT M) { return M == LT.second; }))) @@ -2210,7 +2210,7 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, if (useNeonVector(Src)) return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); - auto LT = TLI->getTypeLegalizationCost(DL, Src); + auto LT = getTypeLegalizationCost(Src); if (!LT.first.isValid()) return InstructionCost::getInvalid(); @@ -2235,7 +2235,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost( return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I); auto *VT = cast(DataTy); - auto LT = TLI->getTypeLegalizationCost(DL, DataTy); + auto LT = getTypeLegalizationCost(DataTy); if (!LT.first.isValid()) return InstructionCost::getInvalid(); @@ -2272,7 +2272,7 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind); - auto LT = TLI->getTypeLegalizationCost(DL, Ty); + auto LT = getTypeLegalizationCost(Ty); if (!LT.first.isValid()) return InstructionCost::getInvalid(); @@ -2617,7 +2617,7 @@ InstructionCost AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (LT.second.getScalarType() == MVT::f16 && 
!ST->hasFullFP16()) return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); @@ -2641,7 +2641,7 @@ AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); InstructionCost LegalizationCost = 0; if (LT.first > 1) { Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); @@ -2690,7 +2690,7 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, if (isa(ValTy)) return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -2782,7 +2782,7 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { { TTI::SK_Splice, MVT::nxv2f64, 1 }, }; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; EVT PromotedVT = LT.second.getScalarType() == MVT::i1 @@ -2819,7 +2819,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, ArrayRef Mask, int Index, VectorType *SubTp, ArrayRef Args) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); // If we have a Mask, and the LT is being legalized somehow, split the Mask // into smaller vectors and sum the cost of each shuffle. if (!Mask.empty() && isa(Tp) && LT.second.isVector() && @@ -3016,8 +3016,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // move, so long as the inserted vector is "aligned". 
if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && LT.second.getSizeInBits() <= 128 && SubTp) { - std::pair SubLT = - TLI->getTypeLegalizationCost(DL, SubTp); + std::pair SubLT = getTypeLegalizationCost(SubTp); if (SubLT.second.isVector()) { int NumElts = LT.second.getVectorNumElements(); int NumSubElts = SubLT.second.getVectorNumElements(); @@ -3052,3 +3051,26 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue( return (TailFoldingKindLoc & Required) == Required; } + +InstructionCost +AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const { + // Scaling factors are not free at all. + // Operands | Rt Latency + // ------------------------------------------- + // Rt, [Xn, Xm] | 4 + // ------------------------------------------- + // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 + // Rt, [Xn, Wm, #imm] | + TargetLoweringBase::AddrMode AM; + AM.BaseGV = BaseGV; + AM.BaseOffs = BaseOffset; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Scale; + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) + // Scale represents reg2 * scale, thus account for 1 if + // it is not equal to 0 or 1. + return AM.Scale != 0 && AM.Scale != 1; + return -1; +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index acd243d3f173..67094287ed38 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -371,6 +371,15 @@ public: ArrayRef Mask, int Index, VectorType *SubTp, ArrayRef Args = None); + + /// Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. 
+ InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const; /// @} }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 8feff8fbb9bb..f853181c9bf2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -518,7 +518,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( const Instruction *CxtI) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); // Because we don't have any legal vector operations, but the legal types, we @@ -690,7 +690,7 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, Type *RetTy = ICA.getReturnType(); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); + std::pair LT = getTypeLegalizationCost(RetTy); unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1; @@ -769,7 +769,7 @@ GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); return LT.first * getFullRateInstrCost(); } @@ -784,7 +784,7 @@ GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); return LT.first * getHalfRateInstrCost(CostKind); } @@ -1148,3 +1148,17 @@ int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const { : ST->hasHalfRate64Ops() ? 
getHalfRateInstrCost(CostKind) : getQuarterRateInstrCost(CostKind); } + +std::pair +GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { + std::pair Cost = BaseT::getTypeLegalizationCost(Ty); + auto Size = DL.getTypeSizeInBits(Ty); + // Maximum load or store can handle 8 dwords for scalar and 4 for + // vector ALU. Let's assume anything above 8 dwords is expensive + // even if legal. + if (Size <= 256) + return Cost; + + Cost.first += (Size + 255) / 256; + return Cost; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index eeb304311342..4a8b2eeebb6a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -94,6 +94,8 @@ class GCNTTIImpl final : public BasicTTIImplBase { // quarter. This also applies to some integer operations. int get64BitInstrCost(TTI::TargetCostKind CostKind) const; + std::pair getTypeLegalizationCost(Type *Ty) const; + public: explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2dd0da5cd56e..fe5ceb277811 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12947,22 +12947,6 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); } -std::pair -SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const { - std::pair Cost = - TargetLoweringBase::getTypeLegalizationCost(DL, Ty); - auto Size = DL.getTypeSizeInBits(Ty); - // Maximum load or store can handle 8 dwords for scalar and 4 for - // vector ALU. Let's assume anything above 8 dwords is expensive - // even if legal. 
- if (Size <= 256) - return Cost; - - Cost.first += (Size + 255) / 256; - return Cost; -} - bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { SDNode::use_iterator I = N->use_begin(), E = N->use_end(); for (; I != E; ++I) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index f7ea96553e71..81bd8dedc303 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -525,9 +525,6 @@ public: const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; - std::pair getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const; - MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override; }; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 85b0bc80b22d..2306193f0429 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -19147,18 +19147,6 @@ bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { return true; } -InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, - Type *Ty, - unsigned AS) const { - if (isLegalAddressingMode(DL, AM, Ty, AS)) { - if (Subtarget->hasFPAO()) - return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster - return 0; - } - return -1; -} - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. 
fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index fae279ea7569..8947c4add327 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -470,14 +470,6 @@ class VectorType; Type *Ty, unsigned AS, Instruction *I = nullptr) const override; - /// getScalingFactorCost - Return the cost of the scaling used in - /// addressing mode represented by AM. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, the return value must be negative. - InstructionCost getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, Type *Ty, - unsigned AS) const override; - bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// Returns true if the addressing mode representing by AM is legal diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index c97b3e7862b9..11f3dd3094b6 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -634,7 +634,7 @@ InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_EXTEND, MVT::v2f32, 2}, {ISD::FP_EXTEND, MVT::v4f32, 4}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) return AdjustCost(LT.first * Entry->Cost); } @@ -901,7 +901,7 @@ InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, // sometimes just be vmovs. Integer involve being passes to GPR registers, // causing more of a delay. std::pair LT = - getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType()); + getTypeLegalizationCost(ValTy->getScalarType()); return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 
4 : 1); } @@ -926,7 +926,7 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, // - may require one or more conditional mov (including an IT), // - can't operate directly on immediates, // - require live flags, which we can't copy around easily. - InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first; + InstructionCost Cost = getTypeLegalizationCost(ValTy).first; // Possible IT instruction for Thumb2, or more for Thumb1. ++Cost; @@ -1003,8 +1003,7 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return Entry->Cost; } - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); return LT.first; } @@ -1028,8 +1027,7 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, I); } - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); int BaseCost = ST->getMVEVectorCostFactor(CostKind); // There are two types - the input that specifies the type of the compare // and the output vXi1 type. 
Because we don't know how the output will be @@ -1222,7 +1220,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; @@ -1243,7 +1241,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; @@ -1267,7 +1265,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; @@ -1283,7 +1281,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost * @@ -1291,7 +1289,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, } if (!Mask.empty()) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (LT.second.isVector() && Mask.size() <= LT.second.getVectorNumElements() && (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || @@ -1328,7 +1326,7 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost( } } - std::pair 
LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (ST->hasNEON()) { const unsigned FunctionCallDivCost = 20; @@ -1467,7 +1465,7 @@ InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, cast(Src)->getElementType()->isDoubleTy()) { // Unaligned loads/stores are extremely inefficient. // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. - std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); return LT.first * 4; } @@ -1568,7 +1566,7 @@ InstructionCost ARMTTIImpl::getGatherScatterOpCost( unsigned NumElems = VTy->getNumElements(); unsigned EltSize = VTy->getScalarSizeInBits(); - std::pair LT = TLI->getTypeLegalizationCost(DL, DataTy); + std::pair LT = getTypeLegalizationCost(DataTy); // For now, it is assumed that for the MVE gather instructions the loads are // all effectively serialised. This means the cost is the scalar cost @@ -1664,7 +1662,7 @@ ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); static const CostTblEntry CostTblAdd[]{ {ISD::ADD, MVT::v16i8, 1}, @@ -1688,8 +1686,7 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost( switch (ISD) { case ISD::ADD: if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); // The legal cases are: // VADDV u/s 8/16/32 @@ -1720,8 +1717,7 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, EVT ResVT = TLI->getValueType(DL, ResTy); if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); // 
The legal cases are: // VMLAV u/s 8/16/32 @@ -1763,7 +1759,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, break; Type *VT = ICA.getReturnType(); - std::pair LT = TLI->getTypeLegalizationCost(DL, VT); + std::pair LT = getTypeLegalizationCost(VT); if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || LT.second == MVT::v16i8) { // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we @@ -1783,7 +1779,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, break; Type *VT = ICA.getReturnType(); - std::pair LT = TLI->getTypeLegalizationCost(DL, VT); + std::pair LT = getTypeLegalizationCost(VT); if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || LT.second == MVT::v16i8) return LT.first * ST->getMVEVectorCostFactor(CostKind); @@ -1794,7 +1790,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (!ST->hasMVEFloatOps()) break; Type *VT = ICA.getReturnType(); - std::pair LT = TLI->getTypeLegalizationCost(DL, VT); + std::pair LT = getTypeLegalizationCost(VT); if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) return LT.first * ST->getMVEVectorCostFactor(CostKind); break; @@ -1804,7 +1800,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ICA.getArgTypes().empty()) break; bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; - auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); + auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); EVT MTy = TLI->getValueType(DL, ICA.getReturnType()); // Check for the legal types, with the corect subtarget features. 
if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) || @@ -2416,3 +2412,20 @@ bool ARMTTIImpl::preferPredicatedReductionSelect( return false; return true; } + +InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, + bool HasBaseReg, int64_t Scale, + unsigned AddrSpace) const { + TargetLoweringBase::AddrMode AM; + AM.BaseGV = BaseGV; + AM.BaseOffs = BaseOffset; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Scale; + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) { + if (ST->hasFPAO()) + return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster + return 0; + } + return -1; +} diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index a3aed48f6beb..72d28acf6d4a 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -287,6 +287,14 @@ public: InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); + /// getScalingFactorCost - Return the cost of the scaling used in + /// addressing mode represented by AM. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, the return value must be negative. 
+ InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const; + bool maybeLoweredToCall(Instruction &I); bool isLoweredToCall(const Function *F); bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index bb0aaa3150fb..e11bef95e14e 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -145,7 +145,7 @@ HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { if (ICA.getID() == Intrinsic::bswap) { std::pair LT = - TLI.getTypeLegalizationCost(DL, ICA.getReturnType()); + getTypeLegalizationCost(ICA.getReturnType()); return LT.first + 2; } return BaseT::getIntrinsicInstrCost(ICA, CostKind); @@ -254,7 +254,7 @@ InstructionCost HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, const Instruction *I) { if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) { - std::pair LT = TLI.getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); if (Opcode == Instruction::FCmp) return LT.first + FloatFactor * getTypeNumElements(ValTy); } @@ -274,7 +274,7 @@ InstructionCost HexagonTTIImpl::getArithmeticInstrCost( Opd2PropInfo, Args, CxtI); if (Ty->isVectorTy()) { - std::pair LT = TLI.getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (LT.second.isFloatingPoint()) return LT.first + FloatFactor * getTypeNumElements(Ty); } @@ -291,10 +291,8 @@ InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; unsigned DstN = DstTy->isFPOrFPVectorTy() ? 
getTypeNumElements(DstTy) : 0; - std::pair SrcLT = - TLI.getTypeLegalizationCost(DL, SrcTy); - std::pair DstLT = - TLI.getTypeLegalizationCost(DL, DstTy); + std::pair SrcLT = getTypeLegalizationCost(SrcTy); + std::pair DstLT = getTypeLegalizationCost(DstTy); InstructionCost Cost = std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN); // TODO: Allow non-throughput costs that aren't binary. diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index fc4bc6b3cbf7..7b4ab9134eb4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -428,7 +428,7 @@ InstructionCost NVPTXTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, const Instruction *CxtI) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 6a86cfa50121..27ba6d780dd7 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -331,8 +331,7 @@ InstructionCost PPCTTIImpl::getUserCost(const User *U, if (U->getType()->isVectorTy()) { // Instructions that need to be split should cost more. 
- std::pair LT = - TLI->getTypeLegalizationCost(DL, U->getType()); + std::pair LT = getTypeLegalizationCost(U->getType()); return LT.first * BaseT::getUserCost(U, Operands, CostKind); } @@ -960,7 +959,7 @@ InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode, if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy()) return InstructionCost(1); - std::pair LT1 = TLI->getTypeLegalizationCost(DL, Ty1); + std::pair LT1 = getTypeLegalizationCost(Ty1); // If type legalization involves splitting the vector, we don't want to // double the cost at every step - only the last step. if (LT1.first != 1 || !LT1.second.isVector()) @@ -971,7 +970,7 @@ InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode, return InstructionCost(1); if (Ty2) { - std::pair LT2 = TLI->getTypeLegalizationCost(DL, Ty2); + std::pair LT2 = getTypeLegalizationCost(Ty2); if (LT2.first != 1 || !LT2.second.isVector()) return InstructionCost(1); } @@ -1014,7 +1013,7 @@ InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, return InstructionCost::getMax(); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); // PPC, for both Altivec/VSX, support cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant @@ -1156,7 +1155,7 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); @@ -1246,7 +1245,7 @@ InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost( "Expect a vector type for interleaved memory op"); // Legalize the type. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, VecTy); + std::pair LT = getTypeLegalizationCost(VecTy); // Firstly, the cost of load/store operation. InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), @@ -1427,8 +1426,7 @@ InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src, assert(SrcVTy && "Expected a vector type for VP memory operations"); if (hasActiveVectorLength(Opcode, Src, Alignment)) { - std::pair LT = - TLI->getTypeLegalizationCost(DL, SrcVTy); + std::pair LT = getTypeLegalizationCost(SrcVTy); InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 914631a023c3..cd414a761121 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -168,7 +168,7 @@ RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { } InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); unsigned Cost = 2; // vslidedown+vslideup. // TODO: LMUL should increase cost. @@ -182,7 +182,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, int Index, VectorType *SubTp, ArrayRef Args) { if (isa(Tp)) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); switch (Kind) { default: // Fallthrough to generic handling. 
@@ -257,7 +257,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, // TODO: add more intrinsic case Intrinsic::experimental_stepvector: { unsigned Cost = 1; // vid - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); return Cost + (LT.first - 1); } default: @@ -364,7 +364,7 @@ RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, if (Ty->getScalarSizeInBits() > ST->getELEN()) return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (Ty->getElementType()->isIntegerTy(1)) // vcpop sequences, see vreduction-mask.ll. umax, smin actually only // cost 2, but we don't have enough info here so we slightly over cost. @@ -394,7 +394,7 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, ISD != ISD::FADD) return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (Ty->getElementType()->isIntegerTy(1)) // vcpop sequences, see vreduction-mask.ll return (LT.first - 1) + (ISD == ISD::AND ? 
3 : 2); @@ -423,7 +423,7 @@ InstructionCost RISCVTTIImpl::getExtendedReductionCost( return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits()) return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6f9491ef2683..7eee6f114aee 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56478,35 +56478,6 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return Res; } -InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, - Type *Ty, - unsigned AS) const { - // Scaling factors are not free at all. - // An indexed folded instruction, i.e., inst (reg1, reg2, scale), - // will take 2 allocations in the out of order engine instead of 1 - // for plain addressing mode, i.e. inst (reg1). - // E.g., - // vaddps (%rsi,%rdx), %ymm0, %ymm1 - // Requires two allocations (one for the load, one for the computation) - // whereas: - // vaddps (%rsi), %ymm0, %ymm1 - // Requires just 1 allocation, i.e., freeing allocations for other operations - // and having less micro operations to execute. - // - // For some X86 architectures, this is even worse because for instance for - // stores, the complex addressing mode forces the instruction to use the - // "load" ports instead of the dedicated "store" port. - // E.g., on Haswell: - // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. - // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. - if (isLegalAddressingMode(DL, AM, Ty, AS)) - // Scale represents reg2 * scale, thus account for 1 - // as soon as we use a second register. 
- return AM.Scale != 0; - return -1; -} - bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 18fb2dbe8d71..b5cfcda519de 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1240,15 +1240,6 @@ namespace llvm { bool isLegalStoreImmediate(int64_t Imm) const override; - /// Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - InstructionCost getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, Type *Ty, - unsigned AS) const override; - /// This is used to enable splatted operand transforms for vector shifts /// and vector funnel shifts. bool isVectorShiftByScalarCheap(Type *Ty) const override; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 14cd86cb9c94..d409885a9131 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -202,7 +202,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( } // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -1089,7 +1089,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, ArrayRef Args) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, BaseTp); + std::pair LT = getTypeLegalizationCost(BaseTp); Kind = improveShuffleKindFromMask(Kind, Mask); // Treat Transpose as 2-op shuffles - there's no difference in lowering. @@ -1108,8 +1108,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, int NumElts = LT.second.getVectorNumElements(); if ((Index % NumElts) == 0) return 0; - std::pair SubLT = - TLI->getTypeLegalizationCost(DL, SubTp); + std::pair SubLT = getTypeLegalizationCost(SubTp); if (SubLT.second.isVector()) { int NumSubElts = SubLT.second.getVectorNumElements(); if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) @@ -1155,8 +1154,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // isn't free, because we need to preserve the rest of the wide vector. if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { int NumElts = LT.second.getVectorNumElements(); - std::pair SubLT = - TLI->getTypeLegalizationCost(DL, SubTp); + std::pair SubLT = getTypeLegalizationCost(SubTp); if (SubLT.second.isVector()) { int NumSubElts = SubLT.second.getVectorNumElements(); if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) @@ -2528,9 +2526,8 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, } // Fall back to legalized types. - std::pair LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair LTDest = - TLI->getTypeLegalizationCost(DL, Dst); + std::pair LTSrc = getTypeLegalizationCost(Src); + std::pair LTDest = getTypeLegalizationCost(Dst); // If we're truncating to the same legalized type - just assume its free. if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) @@ -2630,7 +2627,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, I); // Legalize the type. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; @@ -3395,7 +3392,7 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, OpTy); + std::pair LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; // Attempt to lookup cost. @@ -3629,8 +3626,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair LT = - TLI->getTypeLegalizationCost(DL, RetTy); + std::pair LT = getTypeLegalizationCost(RetTy); MVT MTy = LT.second; // Attempt to lookup cost. @@ -3709,7 +3705,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return 1; // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair LT = getTypeLegalizationCost(Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -3797,7 +3793,7 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, cast(Ty)->getNumElements() && "Vector size mismatch"); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); MVT MScalarTy = LT.second.getScalarType(); unsigned SizeInBits = LT.second.getSizeInBits(); @@ -3987,10 +3983,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); // Legalize the types. 
- MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; - MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second; - MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second; - MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second; + MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second; + MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second; + MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second; + MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second; // They should have legalized into vector types. if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) @@ -4064,7 +4060,7 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, CostKind); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); auto *VTy = dyn_cast(Src); @@ -4227,7 +4223,7 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, } // Legalize the type. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + std::pair LT = getTypeLegalizationCost(SrcVTy); auto VT = TLI->getValueType(DL, SrcVTy); InstructionCost Cost = 0; if (VT.isSimple() && LT.second != VT.getSimpleVT() && @@ -4343,7 +4339,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, return Entry->Cost; } - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; @@ -4531,7 +4527,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); MVT MTy = LT.second; @@ -4661,7 +4657,7 @@ InstructionCost X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) { - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; @@ -5088,10 +5084,8 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, auto *IndexVTy = FixedVectorType::get( IntegerType::get(SrcVTy->getContext(), IndexSize), VF); - std::pair IdxsLT = - TLI->getTypeLegalizationCost(DL, IndexVTy); - std::pair SrcLT = - TLI->getTypeLegalizationCost(DL, SrcVTy); + std::pair IdxsLT = getTypeLegalizationCost(IndexVTy); + std::pair SrcLT = getTypeLegalizationCost(SrcVTy); InstructionCost::CostType SplitFactor = *std::max(IdxsLT.first, SrcLT.first).getValue(); if (SplitFactor > 1) { @@ -5533,7 +5527,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( // Calculate the number of memory operations (NumOfMemOps), required // for load/store the VecTy. 
- MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + MVT LegalVT = getTypeLegalizationCost(VecTy).second; unsigned VecTySize = DL.getTypeStoreSize(VecTy); unsigned LegalVTSize = LegalVT.getStoreSize(); unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; @@ -5613,8 +5607,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), VecTy->getNumElements() / Factor); InstructionCost NumOfResults = - getTLI()->getTypeLegalizationCost(DL, ResultTy).first * - NumOfLoadsInInterleaveGrp; + getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp; // About a half of the loads may be folded in shuffles when we have only // one result. If we have more than one result, or the loads are masked, @@ -5711,7 +5704,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( // VecTy for interleave memop is . // So, for VF=4, Interleave Factor = 3, Element type = i32 we have // VecTy = <12 x i32>. - MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + MVT LegalVT = getTypeLegalizationCost(VecTy).second; // This function can be called with VecTy=<6xi128>, Factor=3, in which case // the VF=2, while v2i128 is an unsupported MVT vector type @@ -5989,3 +5982,37 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); } + +InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, + bool HasBaseReg, int64_t Scale, + unsigned AddrSpace) const { + // Scaling factors are not free at all. + // An indexed folded instruction, i.e., inst (reg1, reg2, scale), + // will take 2 allocations in the out of order engine instead of 1 + // for plain addressing mode, i.e. inst (reg1). 
+ // E.g., + // vaddps (%rsi,%rdx), %ymm0, %ymm1 + // Requires two allocations (one for the load, one for the computation) + // whereas: + // vaddps (%rsi), %ymm0, %ymm1 + // Requires just 1 allocation, i.e., freeing allocations for other operations + // and having less micro operations to execute. + // + // For some X86 architectures, this is even worse because for instance for + // stores, the complex addressing mode forces the instruction to use the + // "load" ports instead of the dedicated "store" port. + // E.g., on Haswell: + // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. + // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. + TargetLoweringBase::AddrMode AM; + AM.BaseGV = BaseGV; + AM.BaseOffs = BaseOffset; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Scale; + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) + // Scale represents reg2 * scale, thus account for 1 + // as soon as we use a second register. + return AM.Scale != 0; + return -1; +} diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 9f83c0461b56..f5f6097111e9 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -226,6 +226,15 @@ public: InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); + /// Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. + InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const; + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2); bool canMacroFuseCmp();