[Hexagon] Initial instruction cost model for auto-vectorization

llvm-svn: 330065
2018-04-13 20:46:50 +00:00 · 2018-04-13 20:46:50 +00:00 · 4bdf1aa416
parent 13e186c088
commit 4bdf1aa416
2 changed files with 195 additions and 98 deletions
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@ -16,6 +16,7 @@
 #include "HexagonTargetTransformInfo.h"
 #include "HexagonSubtarget.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/User.h"
@ -27,16 +28,35 @@ using namespace llvm;

 #define DEBUG_TYPE "hexagontti"

-static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
+static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(true),
  cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));

 static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
  cl::init(true), cl::Hidden,
  cl::desc("Control lookup table emission on Hexagon target"));

+
+/// Returns true when the subtarget reports HVX operations as usable and the
+/// -hexagon-autohvx option is set.
+bool HexagonTTIImpl::useHVX() const {
+  return ST.useHVXOps() && HexagonAutoHVX;
+}
+
+/// Decides whether the vector type \p VecTy is treated as an HVX type by the
+/// cost model. \p VecTy must be a vector type (asserted).
+bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
+  assert(VecTy->isVectorTy());
+  // Avoid types like <2 x i32*>.
+  if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
+    return false;
+  EVT VecVT = EVT::getEVT(VecTy);
+  // Reject types that have no simple value type or are 64 bits or narrower.
+  if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64)
+    return false;
+  if (ST.isHVXVectorType(VecVT.getSimpleVT()))
+    return true;
+  // Otherwise accept the type only if the preferred action is to widen it.
+  auto Action = TLI.getPreferredVectorAction(VecVT);
+  return Action == TargetLoweringBase::TypeWidenVector;
+}
+
 TargetTransformInfo::PopcntSupportKind
 HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
-  // Return Fast Hardware support as every input  < 64 bits will be promoted
+  // Return fast hardware support as every input < 64 bits will be promoted
  // to 64 bits.
  return TargetTransformInfo::PSK_FastHardware;
 }
@ -58,14 +78,16 @@ bool HexagonTTIImpl::shouldFavorPostInc() const {
  return true;
 }

+// --- Vector TTI begin ---
+
 unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
+  // Vector registers are reported (32 of them) only when useHVX() holds;
+  // the non-vector count is always 32.
  if (Vector)
-    return HexagonAutoHVX && getST()->useHVXOps() ? 32 : 0;
+    return useHVX() ? 32 : 0;
  return 32;
 }

 unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  // VF is not consulted: the factor is 2 when useHVX() holds, 0 otherwise.
-  return HexagonAutoHVX && getST()->useHVXOps() ? 64 : 0;
+  return useHVX() ? 2 : 0;
 }

 unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
@ -73,38 +95,161 @@ unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
 }

 unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
+  // 8 * getVectorLength() when useHVX() holds, 0 otherwise.
+  // NOTE(review): presumably getVectorLength() is a byte count, making this
+  // a width in bits -- confirm against HexagonSubtarget.
-  return getST()->useHVXOps() ? getST()->getVectorLength()*8 : 0;
+  return useHVX() ? ST.getVectorLength()*8 : 0;
 }

 unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
+  // How many ElemWidth-sized elements fit in 8 * getVectorLength().
-  return (8 * getST()->getVectorLength()) / ElemWidth;
+  return (8 * ST.getVectorLength()) / ElemWidth;
+}
+
+/// Defers to the BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
+      bool Extract) {
+  return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
+}
+
+/// Defers to the BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
+      ArrayRef<const Value*> Args, unsigned VF) {
+  return BaseT::getOperandsScalarizationOverhead(Args, VF);
+}
+
+/// Defers to the BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
+      ArrayRef<Type*> Tys) {
+  return BaseT::getCallInstrCost(F, RetTy, Tys);
+}
+
+/// Intrinsic cost given the call's argument values. Defers to the
+/// BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+      ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+}
+
+/// Intrinsic cost given the argument types. bswap is costed as the first
+/// component of the type legalization cost of \p RetTy plus 2; every other
+/// intrinsic defers to the BasicTTIImplBase implementation.
+unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+      ArrayRef<Type*> Tys, FastMathFlags FMF,
+      unsigned ScalarizationCostPassed) {
+  if (ID == Intrinsic::bswap) {
+    std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy);
+    return LT.first + 2;
+  }
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                      ScalarizationCostPassed);
+}
+
+/// Address computation is always given a cost of 0; no argument is consulted.
+unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
+      ScalarEvolution *SE, const SCEV *S) {
+  return 0;
 }

 unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
      unsigned Alignment, unsigned AddressSpace, const Instruction *I) {
+  // Only vector loads receive a Hexagon-specific cost in this function.
+  // Stores and non-vector loads defer to the BasicTTIImplBase implementation.
-  if (Opcode == Instruction::Load && Src->isVectorTy()) {
+  assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
+  if (Opcode == Instruction::Store)
+    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+
+  if (Src->isVectorTy()) {
    VectorType *VecTy = cast<VectorType>(Src);
    unsigned VecWidth = VecTy->getBitWidth();
-    if (VecWidth > 64) {
-      // Assume that vectors longer than 64 bits are meant for HVX.
-      if (getNumberOfRegisters(true) > 0) {
-        if (VecWidth % getRegisterBitWidth(true) == 0)
-          return 1;
-      }
+    if (useHVX() && isTypeForHVX(VecTy)) {
+      unsigned RegWidth = getRegisterBitWidth(true);
+      // Alignment beyond RegWidth/8 is not taken into account.
+      Alignment = std::min(Alignment, RegWidth/8);
+      // Cost of HVX loads.
+      if (VecWidth % RegWidth == 0)
+        return VecWidth / RegWidth;
+      // Cost of constructing HVX vector from scalar loads.
      unsigned AlignWidth = 8 * std::max(1u, Alignment);
      unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
      return 3*NumLoads;
    }
+
+    // Non-HVX vectors.
+    // Add extra cost for floating point types.
+    unsigned Cost = VecTy->getElementType()->isFloatingPointTy() ? 4 : 1;
+
+    // Alignment beyond 8 is not taken into account.
+    Alignment = std::min(Alignment, 8u);
+    unsigned AlignWidth = 8 * std::max(1u, Alignment);
+    unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
+    if (Alignment == 4 || Alignment == 8)
+      return Cost * NumLoads;
+    // Loads of less than 32 bits will need extra inserts to compose a vector.
+    // The multiplier is 3 for Alignment 1 and 2 for Alignment 2.
+    // NOTE(review): Alignment == 0 reaches Log2_32(0) here -- confirm that
+    // callers never pass 0, or that the resulting cost is intended.
+    unsigned LogA = Log2_32(Alignment);
+    return (3 - LogA) * Cost * NumLoads;
  }
+
  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
 }

+/// Defers to the BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode,
+      Type *Src, unsigned Alignment, unsigned AddressSpace) {
+  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+}
+
+/// Every shuffle is given a flat cost of 1; no argument is consulted.
+unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+      int Index, Type *SubTp) {
+  return 1;
+}
+
+/// Defers to the BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+      Value *Ptr, bool VariableMask, unsigned Alignment) {
+  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+                                       Alignment);
+}
+
+/// Defers to the BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
+      Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+      unsigned Alignment, unsigned AddressSpace) {
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
+/// Cost of a compare or select. A vector FCmp is costed as the first
+/// component of the type legalization cost of \p ValTy plus 4 per vector
+/// element; every other case defers to the BasicTTIImplBase implementation.
+unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+      Type *CondTy, const Instruction *I) {
+  if (Opcode == Instruction::FCmp && ValTy->isVectorTy()) {
+    // isVectorTy() has already established the type, so cast<> is the right
+    // tool: it asserts on mismatch rather than yielding a null pointer that
+    // would be dereferenced unchecked.
+    auto *VecTy = cast<VectorType>(ValTy);
+    std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
+    return LT.first + 4 * VecTy->getNumElements();
+  }
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+}
+
+/// Defers to the BasicTTIImplBase implementation with unchanged arguments.
+unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+      TTI::OperandValueProperties Opd1PropInfo,
+      TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value*> Args) {
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                       Opd1PropInfo, Opd2PropInfo, Args);
+}
+
+/// Every cast is given a flat cost of 1; no argument is consulted.
+unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+      Type *Src, const Instruction *I) {
+  return 1;
+}
+
+/// Cost of an insertelement/extractelement-style operation.
+/// \p Val may be a vector type or a non-vector type; a non-vector type is
+/// used directly as the element type.
+/// - ExtractElement costs 2.
+/// - InsertElement costs 2 for a non-zero \p Index and 0 for index 0, plus
+///   the cost of an ExtractElement when the element is not a 32-bit integer.
+/// - Any other opcode costs 1.
+unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+      unsigned Index) {
+  Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
+                                   : Val;
+  if (Opcode == Instruction::InsertElement) {
+    // Need two rotations for non-zero index.
+    unsigned Cost = (Index != 0) ? 2 : 0;
+    if (ElemTy->isIntegerTy(32))
+      return Cost;
+    // If it's not a 32-bit value, there will need to be an extract.
+    return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index);
+  }
+
+  if (Opcode == Instruction::ExtractElement)
+    return 2;
+
+  return 1;
+}
+
+// --- Vector TTI end ---
+
 unsigned HexagonTTIImpl::getPrefetchDistance() const {
+  // Forwards the subtarget's L1 prefetch distance.
-  return getST()->getL1PrefetchDistance();
+  return ST.getL1PrefetchDistance();
 }

 unsigned HexagonTTIImpl::getCacheLineSize() const {
+  // Forwards the subtarget's L1 cache line size.
-  return getST()->getL1CacheLineSize();
+  return ST.getL1CacheLineSize();
 }

 int HexagonTTIImpl::getUserCost(const User *U,
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@ -37,16 +37,19 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {

  friend BaseT;

-  const HexagonSubtarget *ST;
-  const HexagonTargetLowering *TLI;
+  const HexagonSubtarget &ST;
+  const HexagonTargetLowering &TLI;

-  const HexagonSubtarget *getST() const { return ST; }
-  const HexagonTargetLowering *getTLI() const { return TLI; }
+  const HexagonSubtarget *getST() const { return &ST; }
+  const HexagonTargetLowering *getTLI() const { return &TLI; }
+
+  bool useHVX() const;
+  bool isTypeForHVX(Type *VecTy) const;

 public:
  explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F)
-      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
-        TLI(ST->getTargetLowering()) {}
+      : BaseT(TM, F.getParent()->getDataLayout()),
+        ST(*TM->getSubtargetImpl(F)), TLI(*ST.getTargetLowering()) {}

  /// \name Scalar TTI Implementations
  /// @{
@ -73,110 +76,59 @@ public:
  unsigned getMaxInterleaveFactor(unsigned VF);
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
-  bool shouldMaximizeVectorBandwidth(bool OptSize) const { return true; }
  unsigned getMinimumVF(unsigned ElemWidth) const;

+  bool shouldMaximizeVectorBandwidth(bool OptSize) const {
+    return true;
+  }
  bool supportsEfficientVectorElementLoadStore() {
    return false;
  }
-
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
-    return 0;
-  }
-
-  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
-                                            unsigned VF) {
-    return 0;
-  }
-
-  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys) {
-    return 1;
-  }
-
-  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-            ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
-    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
-  }
-  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-            ArrayRef<Type*> Tys, FastMathFlags FMF,
-            unsigned ScalarizationCostPassed = UINT_MAX) {
-    return 1;
-  }
-
  bool hasBranchDivergence() {
    return false;
  }
-
  bool enableAggressiveInterleaving(bool LoopHasReductions) {
    return false;
  }
-
-  unsigned getCFInstrCost(unsigned Opcode) {
-    return 1;
-  }
-
-  unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *,
-                                     const SCEV *) {
-    return 0;
+  bool prefersVectorizedAddressing() {
+    return false;
  }

+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
+            unsigned VF);
+  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
+  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+            ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
+  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+            ArrayRef<Type*> Tys, FastMathFlags FMF,
+            unsigned ScalarizationCostPassed = UINT_MAX);
+  unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
+            const SCEV *S);
  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
            unsigned AddressSpace, const Instruction *I = nullptr);
-
  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                 unsigned AddressSpace) {
-    return 1;
-  }
-
+            unsigned AddressSpace);
  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
-                          Type *SubTp) {
-    return 1;
-  }
-
+            Type *SubTp);
  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
-                                  bool VariableMask,
-                                  unsigned Alignment) {
-    return 1;
-  }
-
+            bool VariableMask, unsigned Alignment);
  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
-                                      unsigned Factor,
-                                      ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace);
-  }
-
-  unsigned getNumberOfParts(Type *Tp) {
-    return BaseT::getNumberOfParts(Tp);
-  }
-
-  bool prefersVectorizedAddressing() {
-    return true;
-  }
-
+            unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
+            unsigned AddressSpace);
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
-                              const Instruction *I) {
-    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
-  }
-
+            const Instruction *I);
  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
            TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
            TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
            TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
            TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-            ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
-    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                         Opd1PropInfo, Opd2PropInfo, Args);
-  }
-
+            ArrayRef<const Value *> Args = ArrayRef<const Value *>());
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                            const Instruction *I = nullptr) {
-    return 1;
-  }
+            const Instruction *I = nullptr);
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);

-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  unsigned getCFInstrCost(unsigned Opcode) {
    return 1;
  }