forked from OSchip/llvm-project
[LV] Improve register pressure estimate at high VFs
Previously, `getRegUsageForType` was implemented using `getTypeLegalizationCost`. `getRegUsageForType` is used by the loop vectorizer to estimate the register pressure caused by using a vector type. However, `getTypeLegalizationCost` currently only appears to understand splitting and not scalarization, so it significantly underestimates the register requirements. Instead, use `getNumRegisters`, which understands when scalarization can occur (via `computeRegisterProperties`). This was discovered while investigating D118979 (Set maximum VF with shouldMaximizeVectorBandwidth), where under fixed-length 512-bit SVE the loop vectorizer previously ended up costing a v128i1 as 2 v64i* registers when it actually occupies 128 i32 registers. I'm sending this patch early for comment; I'm still doing some sanity checking with LNT. I note that `getRegisterClassForType` appears to return VectorRC even though the types in question (large vNi1 types) end up occupying scalar registers. That might be worth fixing too. Differential Revision: https://reviews.llvm.org/D125918
This commit is contained in:
parent
6ef5e242f2
commit
ade47bdc31
|
@ -730,7 +730,7 @@ public:
|
|||
bool isTypeLegal(Type *Ty) const;
|
||||
|
||||
/// Returns the estimated number of registers required to represent \p Ty.
|
||||
InstructionCost getRegUsageForType(Type *Ty) const;
|
||||
unsigned getRegUsageForType(Type *Ty) const;
|
||||
|
||||
/// Return true if switches should be turned into lookup tables for the
|
||||
/// target.
|
||||
|
@ -1593,7 +1593,7 @@ public:
|
|||
virtual bool isProfitableToHoist(Instruction *I) = 0;
|
||||
virtual bool useAA() = 0;
|
||||
virtual bool isTypeLegal(Type *Ty) = 0;
|
||||
virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
|
||||
virtual unsigned getRegUsageForType(Type *Ty) = 0;
|
||||
virtual bool shouldBuildLookupTables() = 0;
|
||||
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
|
||||
virtual bool shouldBuildRelLookupTables() = 0;
|
||||
|
@ -2032,7 +2032,7 @@ public:
|
|||
}
|
||||
bool useAA() override { return Impl.useAA(); }
|
||||
bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
|
||||
InstructionCost getRegUsageForType(Type *Ty) override {
|
||||
unsigned getRegUsageForType(Type *Ty) override {
|
||||
return Impl.getRegUsageForType(Ty);
|
||||
}
|
||||
bool shouldBuildLookupTables() override {
|
||||
|
|
|
@ -312,7 +312,7 @@ public:
|
|||
|
||||
bool isTypeLegal(Type *Ty) const { return false; }
|
||||
|
||||
InstructionCost getRegUsageForType(Type *Ty) const { return 1; }
|
||||
unsigned getRegUsageForType(Type *Ty) const { return 1; }
|
||||
|
||||
bool shouldBuildLookupTables() const { return true; }
|
||||
|
||||
|
|
|
@ -382,10 +382,9 @@ public:
|
|||
return getTLI()->isTypeLegal(VT);
|
||||
}
|
||||
|
||||
InstructionCost getRegUsageForType(Type *Ty) {
|
||||
InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
|
||||
assert(Val >= 0 && "Negative cost!");
|
||||
return Val;
|
||||
unsigned getRegUsageForType(Type *Ty) {
|
||||
EVT ETy = getTLI()->getValueType(DL, Ty);
|
||||
return getTLI()->getNumRegisters(Ty->getContext(), ETy);
|
||||
}
|
||||
|
||||
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
|
||||
|
|
|
@ -473,7 +473,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
|
|||
return TTIImpl->isTypeLegal(Ty);
|
||||
}
|
||||
|
||||
InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const {
|
||||
unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
|
||||
return TTIImpl->getRegUsageForType(Ty);
|
||||
}
|
||||
|
||||
|
|
|
@ -429,7 +429,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
|
|||
BaseT::getPeelingPreferences(L, SE, PP);
|
||||
}
|
||||
|
||||
InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
|
||||
unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
|
||||
TypeSize Size = Ty->getPrimitiveSizeInBits();
|
||||
if (Ty->isVectorTy()) {
|
||||
if (Size.isScalable() && ST->hasVInstructions())
|
||||
|
|
|
@ -60,7 +60,7 @@ public:
|
|||
|
||||
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
|
||||
|
||||
InstructionCost getRegUsageForType(Type *Ty);
|
||||
unsigned getRegUsageForType(Type *Ty);
|
||||
|
||||
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
|
||||
Align Alignment, unsigned AddressSpace,
|
||||
|
|
|
@ -5987,16 +5987,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
|
|||
|
||||
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
|
||||
|
||||
// A lambda that gets the register usage for the given type and VF.
|
||||
const auto &TTICapture = TTI;
|
||||
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
|
||||
auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
|
||||
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
|
||||
return 0;
|
||||
InstructionCost::CostType RegUsage =
|
||||
*TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
|
||||
assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
|
||||
"Nonsensical values for register usage.");
|
||||
return RegUsage;
|
||||
return TTI.getRegUsageForType(VectorType::get(Ty, VF));
|
||||
};
|
||||
|
||||
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
|
||||
target triple = "aarch64"
|
||||
|
||||
; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
|
||||
|
||||
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
|
||||
; CHECK: LV(REG): VF = 32
|
||||
; CHECK-NEXT: LV(REG): Found max usage: 2 item
|
||||
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
|
||||
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
|
||||
|
||||
define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
|
||||
entry:
|
||||
br label %loop
|
||||
exit:
|
||||
ret i1 %reduction_next
|
||||
loop:
|
||||
%induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
|
||||
%reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
|
||||
%gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
|
||||
%loaded = load i32, ptr %gep
|
||||
%i1 = icmp eq i32 %loaded, %induction
|
||||
%reduction_next = or i1 %i1, %reduction
|
||||
%induction_next = add nuw i32 %induction, 1
|
||||
%cond = icmp eq i32 %induction_next, %arg
|
||||
br i1 %cond, label %exit, label %loop, !llvm.loop !32
|
||||
}
|
||||
|
||||
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
|
||||
; CHECK: LV(REG): VF = 64
|
||||
; CHECK-NEXT: LV(REG): Found max usage: 2 item
|
||||
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
|
||||
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
|
||||
|
||||
define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
|
||||
entry:
|
||||
br label %loop
|
||||
exit:
|
||||
ret i1 %reduction_next
|
||||
loop:
|
||||
%induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
|
||||
%reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ]
|
||||
%gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
|
||||
%loaded = load i32, ptr %gep
|
||||
%i1 = icmp eq i32 %loaded, %induction
|
||||
%reduction_next = or i1 %i1, %reduction
|
||||
%induction_next = add nuw i32 %induction, 1
|
||||
%cond = icmp eq i32 %induction_next, %arg
|
||||
br i1 %cond, label %exit, label %loop, !llvm.loop !64
|
||||
}
|
||||
|
||||
!32 = distinct !{!32, !33}
|
||||
!33 = !{!"llvm.loop.vectorize.width", i32 32}
|
||||
!64 = distinct !{!64, !65}
|
||||
!65 = !{!"llvm.loop.vectorize.width", i32 64}
|
|
@ -0,0 +1,32 @@
|
|||
; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
|
||||
target triple = "x86_64"
|
||||
|
||||
; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
|
||||
|
||||
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
|
||||
; CHECK: LV(REG): VF = 64
|
||||
; CHECK-NEXT: LV(REG): Found max usage: 2 item
|
||||
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
|
||||
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
|
||||
|
||||
define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
|
||||
entry:
|
||||
br label %loop
|
||||
exit:
|
||||
ret i1 %reduction_next
|
||||
loop:
|
||||
%induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
|
||||
%reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
|
||||
%gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
|
||||
%loaded = load i32, ptr %gep
|
||||
%i1 = icmp eq i32 %loaded, %induction
|
||||
%reduction_next = or i1 %i1, %reduction
|
||||
%induction_next = add nuw i32 %induction, 1
|
||||
%cond = icmp eq i32 %induction_next, %arg
|
||||
br i1 %cond, label %exit, label %loop, !llvm.loop !64
|
||||
}
|
||||
|
||||
!64 = distinct !{!64, !65}
|
||||
!65 = !{!"llvm.loop.vectorize.width", i32 64}
|
Loading…
Reference in New Issue