[AMDGPU] Set amdgpu-memory-bound if a basic block has dense global memory access

AMDGPUPerfHintAnalysis doesn't set the memory-bound attribute when
FuncInfo::InstCost outweighs MemInstCost, even if the function has a
basic block with relatively high global memory access. GCNSchedStrategy
can then revert the optimal schedule in favour of occupancy, which
appears to degrade performance for some kernels. This change introduces
the HasDenseGlobalMemAcc metric into the heuristic, making the analysis
more conservative in these cases.
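
For a concrete illustration of the gap (numbers hypothetical, and
assuming MemBoundThresh at its default of 50): with MemInstCost = 20 and
InstCost = 70, the existing check computes 20 * 100 / 70 = 28, below the
threshold, so the attribute is not set even if all of that memory cost
comes from global loads packed into a single basic block. With this
change, such a block sets HasDenseGlobalMemAcc and the function is
marked memory-bound regardless of the ratio.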

This fixes SWDEV-334259/SWDEV-343932

Differential Revision: https://reviews.llvm.org/D129759
Abinav Puthan Purayil 2022-07-13 12:10:02 +05:30
parent 4baf8f092b
commit 9fa425c1ab
3 changed files with 45 additions and 2 deletions

llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp

@@ -116,6 +116,7 @@ private:
   bool isGlobalAddr(const Value *V) const;
   bool isLocalAddr(const Value *V) const;
+  bool isGlobalLoadUsedInBB(const Instruction &) const;
 };
 
 static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -196,6 +197,24 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
   return false;
 }
 
+// Returns true if the global load `I` is used in its own basic block.
+bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
+  const auto *Ld = dyn_cast<LoadInst>(&I);
+  if (!Ld)
+    return false;
+  if (!isGlobalAddr(Ld->getPointerOperand()))
+    return false;
+
+  for (const User *Usr : Ld->users()) {
+    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
+      if (UsrInst->getParent() == I.getParent())
+        return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
   AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
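
A concrete case of what the new helper counts: given a block containing
%v = load double, double addrspace(1)* %p followed by
%s = fadd double %v, %v, the load has a user in its own block and so
contributes its Size to the block's UsedGlobalLoadsInBB counter (see the
next hunk). A global load whose only users sit in other blocks does not
count, and neither do global stores, since the helper looks through
LoadInst only.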
@@ -203,9 +222,14 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
   for (auto &B : F) {
     LastAccess = MemAccessInfo();
+    unsigned UsedGlobalLoadsInBB = 0;
     for (auto &I : B) {
       if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
         unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
+        // TODO: Check if the global load and its user are close to each other
+        // instead (Or do this analysis in GCNSchedStrategy?).
+        if (isGlobalLoadUsedInBB(I))
+          UsedGlobalLoadsInBB += Size;
         if (isIndirectAccess(&I))
           FI.IAMInstCost += Size;
         if (isLargeStride(&I))
@@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
         ++FI.InstCost;
       }
     }
+
+    if (!FI.HasDenseGlobalMemAcc) {
+      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
+      if (GlobalMemAccPercentage > 50) {
+        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
+                          << B.getName() << " has " << GlobalMemAccPercentage
+                          << "% global memory access\n");
+        FI.HasDenseGlobalMemAcc = true;
+      }
+    }
   }
 
   return &FI;
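
To see the per-block accounting in isolation, below is a minimal
standalone sketch of the density computation, written as plain C++ over
toy data rather than the LLVM API; ToyInst, hasDenseGlobalMemAcc, and
the sample blocks are hypothetical, while the Size weighting and the 50%
cutoff mirror the hunk above.

  // Standalone model of the dense-global-memory-access heuristic.
  #include <cstdio>
  #include <vector>

  struct ToyInst {
    bool GlobalLoadUsedInBB; // global load with a user in the same block
    unsigned Size;           // divideCeil(type-size-in-bits, 32)
  };

  // Mirrors the patch: the sum of Size over locally-used global loads,
  // as a percentage of the block's instruction count, must exceed 50.
  static bool hasDenseGlobalMemAcc(const std::vector<ToyInst> &Block) {
    unsigned UsedGlobalLoadsInBB = 0;
    for (const ToyInst &I : Block)
      if (I.GlobalLoadUsedInBB)
        UsedGlobalLoadsInBB += I.Size;
    return UsedGlobalLoadsInBB * 100 / Block.size() > 50;
  }

  int main() {
    // Three of four instructions are locally consumed global loads: 75%.
    std::vector<ToyInst> Dense = {{true, 1}, {true, 1}, {true, 1}, {false, 1}};
    // One of four: 25%, below the cutoff.
    std::vector<ToyInst> Sparse = {{true, 1}, {false, 1}, {false, 1}, {false, 1}};
    std::printf("dense: %d, sparse: %d\n", hasDenseGlobalMemAcc(Dense),
                hasDenseGlobalMemAcc(Sparse));
    return 0;
  }

Note that the divisor is the block's instruction count while the
numerator is size-weighted, exactly as in the patch, so a few wide loads
can push a short block over the cutoff.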
@@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
 }
 
 bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  // Reverting optimal scheduling in favour of occupancy with basic block(s)
+  // having dense global memory access can potentially hurt performance.
+  if (FI.HasDenseGlobalMemAcc)
+    return true;
+
   return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
 }
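
Note the ordering this gives isMemBound: the dense-access flag
short-circuits the ratio test, so a function whose overall ratio is
below MemBoundThresh (like the 28% sketch above) is still reported
memory-bound once any single block crossed the 50% density cutoff during
visit(); the ratio test now only decides functions with no such block.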

llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h

@@ -41,7 +41,11 @@ public:
     unsigned InstCost;
     unsigned IAMInstCost; // Indirect access memory instruction count
     unsigned LSMInstCost; // Large stride memory instruction count
-    FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
+    bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
+                               // high global memory access
+    FuncInfo()
+        : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
+          HasDenseGlobalMemAcc(false) {}
   };
 
   typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;

llvm/test/CodeGen/AMDGPU/perfhint.ll

@@ -20,7 +20,7 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_membound_1:
-; GCN: MemoryBound: 0
+; GCN: MemoryBound: 1
 define amdgpu_kernel void @test_membound_1(<2 x double> addrspace(1)* nocapture readonly %ptr.0,
     <2 x double> addrspace(1)* nocapture %ptr.1,
     <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
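
To reproduce the updated check locally, one typical invocation (assuming
a built llvm-lit in your build tree's bin/ directory; the authoritative
RUN line lives at the top of the test file) is something like:

  bin/llvm-lit llvm/test/CodeGen/AMDGPU/perfhint.ll

The ; GCN: lines are FileCheck patterns matched against the compiler's
output, so after this change test_membound_1 is expected to print
MemoryBound: 1.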