forked from OSchip/llvm-project
AMDGPU: Propagate amdgpu-flat-work-group-size attributes
Fixes being overly conservative with the register counts in called functions. Ideally this would do a conservative range merge, but for now it just clones. Also fixes the pass not being functional when run standalone.
This commit is contained in:
parent
55dc123555
commit
53c43431bc
|
@ -32,6 +32,7 @@
|
|||
#include "Utils/AMDGPUBaseInfo.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
|
@ -56,8 +57,10 @@ static constexpr const FeatureBitset TargetFeatures = {
|
|||
};
|
||||
|
||||
// Attributes to propagate.
|
||||
// TODO: Support conservative min/max merging instead of cloning.
|
||||
static constexpr const char* AttributeNames[] = {
|
||||
"amdgpu-waves-per-eu"
|
||||
"amdgpu-waves-per-eu",
|
||||
"amdgpu-flat-work-group-size"
|
||||
};
|
||||
|
||||
static constexpr unsigned NumAttr =
|
||||
|
@ -371,15 +374,28 @@ AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
|
|||
}
|
||||
|
||||
bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
|
||||
if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
|
||||
if (!TM) {
|
||||
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
|
||||
if (!TPC)
|
||||
return false;
|
||||
|
||||
TM = &TPC->getTM<TargetMachine>();
|
||||
}
|
||||
|
||||
if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
|
||||
return false;
|
||||
|
||||
return AMDGPUPropagateAttributes(TM, false).process(F);
|
||||
}
|
||||
|
||||
bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
|
||||
if (!TM)
|
||||
return false;
|
||||
if (!TM) {
|
||||
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
|
||||
if (!TPC)
|
||||
return false;
|
||||
|
||||
TM = &TPC->getTM<TargetMachine>();
|
||||
}
|
||||
|
||||
return AMDGPUPropagateAttributes(TM, true).process(M);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
|
||||
|
||||
; CHECK: define internal void @max_flat_1_1024() #0 {
|
||||
define internal void @max_flat_1_1024() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define internal void @max_flat_1_256() #1 {
|
||||
define internal void @max_flat_1_256() #1 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define amdgpu_kernel void @kernel_1_256_call_default() #1 {
|
||||
define amdgpu_kernel void @kernel_1_256_call_default() #1 {
|
||||
call void @default()
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
|
||||
define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
|
||||
call void @max_flat_1_256()
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
|
||||
define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
|
||||
call void @max_flat_64_64()
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define internal void @max_flat_64_64() #2 {
|
||||
define internal void @max_flat_64_64() #2 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define internal void @default() #2 {
|
||||
define internal void @default() #3 {
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024" }
|
||||
attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256" }
|
||||
attributes #2 = { noinline "amdgpu-flat-work-group-size"="64,64" }
|
||||
attributes #3 = { noinline }
|
||||
|
||||
; CHECK: attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024"
|
||||
; CHECK-NEXT: attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256"
|
||||
; CHECK-NEXT: attributes #2 = { noinline "amdgpu-flat-work-group-size"="1,256"
|
Loading…
Reference in New Issue