[LV] Improve register pressure estimate at high VFs

Previously, `getRegUsageForType` was implemented using
`getTypeLegalizationCost`.  `getRegUsageForType` is used by the loop
vectorizer to estimate the register pressure caused by using a vector
type.  However, `getTypeLegalizationCost` currently appears to
understand only splitting, not scalarization, so it significantly
underestimates the register requirements.

Instead, use `getNumRegisters`, which understands when scalarization
can occur (via computeRegisterProperties).

This was discovered while investigating D118979 (Set maximum VF with
shouldMaximizeVectorBandwidth), where, under fixed-length 512-bit SVE, the
loop vectorizer previously ended up costing a v128i1 as 2 v64i*
registers when it actually occupies 128 i32 registers.

I'm sending this patch early for comment; I'm still doing some sanity checking
with LNT.  I note that getRegisterClassForType appears to return VectorRC even
though the types in question (large vNi1 types) end up occupying scalar
registers. That might be worth fixing too.

Differential Revision: https://reviews.llvm.org/D125918
This commit is contained in:
Peter Waller 2022-05-16 20:59:17 +00:00
parent 6ef5e242f2
commit ade47bdc31
9 changed files with 101 additions and 19 deletions

View File

@ -730,7 +730,7 @@ public:
bool isTypeLegal(Type *Ty) const;
/// Returns the estimated number of registers required to represent \p Ty.
InstructionCost getRegUsageForType(Type *Ty) const;
unsigned getRegUsageForType(Type *Ty) const;
/// Return true if switches should be turned into lookup tables for the
/// target.
@ -1593,7 +1593,7 @@ public:
virtual bool isProfitableToHoist(Instruction *I) = 0;
virtual bool useAA() = 0;
virtual bool isTypeLegal(Type *Ty) = 0;
virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
virtual unsigned getRegUsageForType(Type *Ty) = 0;
virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual bool shouldBuildRelLookupTables() = 0;
@ -2032,7 +2032,7 @@ public:
}
bool useAA() override { return Impl.useAA(); }
bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
InstructionCost getRegUsageForType(Type *Ty) override {
unsigned getRegUsageForType(Type *Ty) override {
return Impl.getRegUsageForType(Ty);
}
bool shouldBuildLookupTables() override {

View File

@ -312,7 +312,7 @@ public:
bool isTypeLegal(Type *Ty) const { return false; }
InstructionCost getRegUsageForType(Type *Ty) const { return 1; }
unsigned getRegUsageForType(Type *Ty) const { return 1; }
bool shouldBuildLookupTables() const { return true; }

View File

@ -382,10 +382,9 @@ public:
return getTLI()->isTypeLegal(VT);
}
InstructionCost getRegUsageForType(Type *Ty) {
InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
assert(Val >= 0 && "Negative cost!");
return Val;
unsigned getRegUsageForType(Type *Ty) {
EVT ETy = getTLI()->getValueType(DL, Ty);
return getTLI()->getNumRegisters(Ty->getContext(), ETy);
}
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,

View File

@ -473,7 +473,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
return TTIImpl->isTypeLegal(Ty);
}
InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const {
unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
return TTIImpl->getRegUsageForType(Ty);
}

View File

@ -429,7 +429,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
TypeSize Size = Ty->getPrimitiveSizeInBits();
if (Ty->isVectorTy()) {
if (Size.isScalable() && ST->hasVInstructions())

View File

@ -60,7 +60,7 @@ public:
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
InstructionCost getRegUsageForType(Type *Ty);
unsigned getRegUsageForType(Type *Ty);
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment, unsigned AddressSpace,

View File

@ -5987,16 +5987,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
// A lambda that gets the register usage for the given type and VF.
const auto &TTICapture = TTI;
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0;
InstructionCost::CostType RegUsage =
*TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
"Nonsensical values for register usage.");
return RegUsage;
return TTI.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {

View File

@ -0,0 +1,57 @@
; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
; REQUIRES: asserts
target triple = "aarch64"
; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
; CHECK: LV(REG): VF = 32
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
; OR-reduction over i1 compare results, with no target features attached to
; the function. Each iteration loads an i32 from %ptr indexed by %induction,
; compares it for equality with %induction, and ORs the i1 result into the
; running %reduction. The loop exits once %induction_next equals %arg and the
; final reduction value is returned.
; The !32 loop metadata on the back-edge requests a vectorization width of 32,
; so the register-usage estimate is reported for a <32 x i1> reduction value.
define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
entry:
br label %loop
exit:
ret i1 %reduction_next
loop:
; Induction variable counts up from 0; the reduction starts from false (0).
%induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
%reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
%gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
%loaded = load i32, ptr %gep
; The i1 produced here is what feeds the OR-reduction below.
%i1 = icmp eq i32 %loaded, %induction
%reduction_next = or i1 %i1, %reduction
%induction_next = add nuw i32 %induction, 1
%cond = icmp eq i32 %induction_next, %arg
br i1 %cond, label %exit, label %loop, !llvm.loop !32
}
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
; CHECK: LV(REG): VF = 64
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
; Same OR-reduction loop shape as the other function in this file, but
; compiled with the "+sve" target feature and vscale_range(2,2), i.e. vscale is
; constrained to exactly 2. Each iteration loads an i32 from %ptr indexed by
; %induction, compares it for equality with %induction, and ORs the i1 result
; into the running %reduction.
; The !64 loop metadata on the back-edge requests a vectorization width of 64,
; so the register-usage estimate is reported for a <64 x i1> reduction value.
; NOTE(review): the reduction phi below starts from true, whereas
; or_reduction_neon starts from 0; with an OR-reduction this makes the result
; always true. It should not matter for a register-usage test, but confirm it
; is intentional.
define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
entry:
br label %loop
exit:
ret i1 %reduction_next
loop:
%induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
%reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ]
%gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
%loaded = load i32, ptr %gep
; The i1 produced here is what feeds the OR-reduction below.
%i1 = icmp eq i32 %loaded, %induction
%reduction_next = or i1 %i1, %reduction
%induction_next = add nuw i32 %induction, 1
%cond = icmp eq i32 %induction_next, %arg
br i1 %cond, label %exit, label %loop, !llvm.loop !64
}
!32 = distinct !{!32, !33}
!33 = !{!"llvm.loop.vectorize.width", i32 32}
!64 = distinct !{!64, !65}
!65 = !{!"llvm.loop.vectorize.width", i32 64}

View File

@ -0,0 +1,32 @@
; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
; REQUIRES: asserts
target triple = "x86_64"
; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
; CHECK: LV(REG): VF = 64
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
; OR-reduction over i1 compare results, compiled with the "+avx" target
; feature. Each iteration loads an i32 from %ptr indexed by %induction,
; compares it for equality with %induction, and ORs the i1 result into the
; running %reduction. The loop exits once %induction_next equals %arg and the
; final reduction value is returned.
; The !64 loop metadata on the back-edge requests a vectorization width of 64,
; so the register-usage estimate is reported for a <64 x i1> reduction value.
define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
entry:
br label %loop
exit:
ret i1 %reduction_next
loop:
; Induction variable counts up from 0; the reduction starts from false (0).
%induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
%reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
%gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
%loaded = load i32, ptr %gep
; The i1 produced here is what feeds the OR-reduction below.
%i1 = icmp eq i32 %loaded, %induction
%reduction_next = or i1 %i1, %reduction
%induction_next = add nuw i32 %induction, 1
%cond = icmp eq i32 %induction_next, %arg
br i1 %cond, label %exit, label %loop, !llvm.loop !64
}
!64 = distinct !{!64, !65}
!65 = !{!"llvm.loop.vectorize.width", i32 64}