forked from OSchip/llvm-project
[AMDGPU] Set amdgpu-memory-bound if a basic block has dense global memory access
AMDGPUPerfHintAnalysis doesn't set the memory bound attribute if FuncInfo::InstCost outweighs MemInstCost even if we have a basic block with relatively high global memory access. GCNSchedStrategy could revert optimal scheduling in favour of occupancy which seems to degrade performance for some kernels. This change introduces the HasDenseGlobalMemAcc metric in the heuristic that makes the analysis more conservative in these cases. This fixes SWDEV-334259/SWDEV-343932 Differential Revision: https://reviews.llvm.org/D129759
This commit is contained in:
parent
4baf8f092b
commit
9fa425c1ab
|
@ -116,6 +116,7 @@ private:
|
|||
|
||||
bool isGlobalAddr(const Value *V) const;
|
||||
bool isLocalAddr(const Value *V) const;
|
||||
bool isGlobalLoadUsedInBB(const Instruction &) const;
|
||||
};
|
||||
|
||||
static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
|
||||
|
@ -196,6 +197,24 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the global load `I` is used in its own basic block.
|
||||
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
|
||||
const auto *Ld = dyn_cast<LoadInst>(&I);
|
||||
if (!Ld)
|
||||
return false;
|
||||
if (!isGlobalAddr(Ld->getPointerOperand()))
|
||||
return false;
|
||||
|
||||
for (const User *Usr : Ld->users()) {
|
||||
if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
|
||||
if (UsrInst->getParent() == I.getParent())
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
|
||||
AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
|
||||
|
||||
|
@ -203,9 +222,14 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
|
|||
|
||||
for (auto &B : F) {
|
||||
LastAccess = MemAccessInfo();
|
||||
unsigned UsedGlobalLoadsInBB = 0;
|
||||
for (auto &I : B) {
|
||||
if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
|
||||
unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
|
||||
// TODO: Check if the global load and its user are close to each other
|
||||
// instead (Or do this analysis in GCNSchedStrategy?).
|
||||
if (isGlobalLoadUsedInBB(I))
|
||||
UsedGlobalLoadsInBB += Size;
|
||||
if (isIndirectAccess(&I))
|
||||
FI.IAMInstCost += Size;
|
||||
if (isLargeStride(&I))
|
||||
|
@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
|
|||
++FI.InstCost;
|
||||
}
|
||||
}
|
||||
|
||||
if (!FI.HasDenseGlobalMemAcc) {
|
||||
unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
|
||||
if (GlobalMemAccPercentage > 50) {
|
||||
LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
|
||||
<< B.getName() << " has " << GlobalMemAccPercentage
|
||||
<< "% global memory access\n");
|
||||
FI.HasDenseGlobalMemAcc = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return &FI;
|
||||
|
@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
|
|||
}
|
||||
|
||||
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
|
||||
// Reverting optimal scheduling in favour of occupancy with basic block(s)
|
||||
// having dense global memory access can potentially hurt performance.
|
||||
if (FI.HasDenseGlobalMemAcc)
|
||||
return true;
|
||||
|
||||
return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
|
||||
}
|
||||
|
||||
|
|
|
@ -41,7 +41,11 @@ public:
|
|||
unsigned InstCost;
|
||||
unsigned IAMInstCost; // Indirect access memory instruction count
|
||||
unsigned LSMInstCost; // Large stride memory instruction count
|
||||
FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
|
||||
bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
|
||||
// high global memory access
|
||||
FuncInfo()
|
||||
: MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
|
||||
HasDenseGlobalMemAcc(false) {}
|
||||
};
|
||||
|
||||
typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
|
||||
|
|
|
@ -20,7 +20,7 @@ bb:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_membound_1:
|
||||
; GCN: MemoryBound: 0
|
||||
; GCN: MemoryBound: 1
|
||||
define amdgpu_kernel void @test_membound_1(<2 x double> addrspace(1)* nocapture readonly %ptr.0,
|
||||
<2 x double> addrspace(1)* nocapture %ptr.1,
|
||||
<2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
|
||||
|
|
Loading…
Reference in New Issue