[LV] Improve register pressure estimate at high VFs

Previously, `getRegUsageForType` was implemented using `getTypeLegalizationCost`. `getRegUsageForType` is used by the loop vectorizer to estimate the register pressure caused by using a vector type. However, `getTypeLegalizationCost` currently only appears to understand splitting and not scalarization, so significantly underestimates the register requirements. Instead, use `getNumRegisters`, which understands when scalarization can occur (via computeRegisterProperties). This was discovered while investigating D118979 (Set maximum VF with shouldMaximizeVectorBandwidth), where under fixed-length 512-bit SVE the loop vectorizer previously ends up costing an v128i1 as 2 v64i* registers where it actually occupies 128 i32 registers. I'm sending this patch early for comment, I'm still doing some sanity checking with LNT. I note that getRegisterClassForType appears to return VectorRC even though the type in question (large vNi1 types) end up occupying scalar registers. That might be worth fixing too. Differential Revision: https://reviews.llvm.org/D125918
2022-05-16 20:59:17 +00:00 · 2022-05-16 20:59:17 +00:00 · ade47bdc31
parent 6ef5e242f2
commit ade47bdc31
9 changed files with 101 additions and 19 deletions
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@ -730,7 +730,7 @@ public:
  bool isTypeLegal(Type *Ty) const;

  /// Returns the estimated number of registers required to represent \p Ty.
-  InstructionCost getRegUsageForType(Type *Ty) const;
+  unsigned getRegUsageForType(Type *Ty) const;

  /// Return true if switches should be turned into lookup tables for the
  /// target.
@ -1593,7 +1593,7 @@ public:
  virtual bool isProfitableToHoist(Instruction *I) = 0;
  virtual bool useAA() = 0;
  virtual bool isTypeLegal(Type *Ty) = 0;
-  virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
+  virtual unsigned getRegUsageForType(Type *Ty) = 0;
  virtual bool shouldBuildLookupTables() = 0;
  virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
  virtual bool shouldBuildRelLookupTables() = 0;
@ -2032,7 +2032,7 @@ public:
  }
  bool useAA() override { return Impl.useAA(); }
  bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
-  InstructionCost getRegUsageForType(Type *Ty) override {
+  unsigned getRegUsageForType(Type *Ty) override {
    return Impl.getRegUsageForType(Ty);
  }
  bool shouldBuildLookupTables() override {
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@ -312,7 +312,7 @@ public:

  bool isTypeLegal(Type *Ty) const { return false; }

-  InstructionCost getRegUsageForType(Type *Ty) const { return 1; }
+  unsigned getRegUsageForType(Type *Ty) const { return 1; }

  bool shouldBuildLookupTables() const { return true; }

--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@ -382,10 +382,9 @@ public:
    return getTLI()->isTypeLegal(VT);
  }

-  InstructionCost getRegUsageForType(Type *Ty) {
-    InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
-    assert(Val >= 0 && "Negative cost!");
-    return Val;
+  unsigned getRegUsageForType(Type *Ty) {
+    EVT ETy = getTLI()->getValueType(DL, Ty);
+    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@ -473,7 +473,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
  return TTIImpl->isTypeLegal(Ty);
 }

-InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const {
+unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
  return TTIImpl->getRegUsageForType(Ty);
 }

--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@ -429,7 +429,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
  BaseT::getPeelingPreferences(L, SE, PP);
 }

-InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
+unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = Ty->getPrimitiveSizeInBits();
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@ -60,7 +60,7 @@ public:

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;

-  InstructionCost getRegUsageForType(Type *Ty);
+  unsigned getRegUsageForType(Type *Ty);

  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                        Align Alignment, unsigned AddressSpace,
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -5987,16 +5987,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

-  // A lambda that gets the register usage for the given type and VF.
-  const auto &TTICapture = TTI;
-  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+  auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0;
-    InstructionCost::CostType RegUsage =
-        *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
-    assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
-           "Nonsensical values for register usage.");
-    return RegUsage;
+    return TTI.getRegUsageForType(VectorType::get(Ty, VF));
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
--- a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
@ -0,0 +1,57 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "aarch64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
+; CHECK: LV(REG): VF = 32
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
+entry:
+  br label %loop
+exit:
+  ret i1 %reduction_next
+loop:
+  %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+  %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+  %loaded = load i32, ptr %gep
+  %i1 = icmp eq i32 %loaded, %induction
+  %reduction_next = or i1 %i1, %reduction
+  %induction_next = add nuw i32 %induction, 1
+  %cond = icmp eq i32 %induction_next, %arg
+  br i1 %cond, label %exit, label %loop, !llvm.loop !32
+}
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
+entry:
+  br label %loop
+exit:
+  ret i1 %reduction_next
+loop:
+  %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+  %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+  %loaded = load i32, ptr %gep
+  %i1 = icmp eq i32 %loaded, %induction
+  %reduction_next = or i1 %i1, %reduction
+  %induction_next = add nuw i32 %induction, 1
+  %cond = icmp eq i32 %induction_next, %arg
+  br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!32 = distinct !{!32, !33}
+!33 = !{!"llvm.loop.vectorize.width", i32 32}
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}
--- a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
@ -0,0 +1,32 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "x86_64"
+
+; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
+
+; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
+; CHECK: LV(REG): VF = 64
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+
+define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
+entry:
+  br label %loop
+exit:
+  ret i1 %reduction_next
+loop:
+  %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
+  %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
+  %loaded = load i32, ptr %gep
+  %i1 = icmp eq i32 %loaded, %induction
+  %reduction_next = or i1 %i1, %reduction
+  %induction_next = add nuw i32 %induction, 1
+  %cond = icmp eq i32 %induction_next, %arg
+  br i1 %cond, label %exit, label %loop, !llvm.loop !64
+}
+
+!64 = distinct !{!64, !65}
+!65 = !{!"llvm.loop.vectorize.width", i32 64}