AMDGPU: Propagate amdgpu-flat-work-group-size attributes

This fixes overly conservative register counts in called functions.
Ideally the pass would do a conservative range merge, but for now it
just clones.

Also fix the pass not being functional when run standalone.
This commit is contained in:
Matt Arsenault 2020-10-19 16:53:00 -04:00
parent 55dc123555
commit 53c43431bc
2 changed files with 68 additions and 4 deletions

View File

@ -32,6 +32,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
@ -56,8 +57,10 @@ static constexpr const FeatureBitset TargetFeatures = {
};
// Attributes to propagate.
// TODO: Support conservative min/max merging instead of cloning.
// Each entry is the string name of a function attribute.
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped; the first "amdgpu-waves-per-eu" line (no trailing comma) is
// presumably the removed pre-change line rather than a separate entry. Read
// literally, the two adjacent literals would concatenate -- confirm against
// the actual file.
static constexpr const char* AttributeNames[] = {
"amdgpu-waves-per-eu"
"amdgpu-waves-per-eu",
"amdgpu-flat-work-group-size"
};
static constexpr unsigned NumAttr =
@ -371,15 +374,28 @@ AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
}
// Per-function entry point of the early variant of the pass.
//
// When TM is null it is fetched from TargetPassConfig if that analysis is
// available; if it is not, the function returns false. Functions whose
// calling convention is not an entry-function calling convention also return
// false. Otherwise the result of AMDGPUPropagateAttributes::process(F) is
// returned.
//
// NOTE(review): the combined `!TM || ...` condition directly below looks like
// the removed pre-change line of a diff whose +/- markers were stripped; the
// lines after it appear to be its replacement -- confirm against the real
// file.
bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
if (!TM) {
// TM is not set: try to obtain it from the pass configuration.
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
TM = &TPC->getTM<TargetMachine>();
}
// Only functions with an entry-function calling convention are processed.
if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
return false;
return AMDGPUPropagateAttributes(TM, false).process(F);
}
// Module-level entry point of the late variant of the pass.
//
// When TM is null it is fetched from TargetPassConfig if that analysis is
// available; if it is not, the function returns false. Otherwise the result
// of AMDGPUPropagateAttributes::process(M) is returned.
//
// NOTE(review): the bare `if (!TM) return false;` directly below looks like
// the removed pre-change lines of a diff whose +/- markers were stripped; the
// braced block after it appears to be the replacement -- confirm against the
// real file.
bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
if (!TM)
return false;
if (!TM) {
// TM is not set: try to obtain it from the pass configuration.
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
TM = &TPC->getTM<TargetMachine>();
}
return AMDGPUPropagateAttributes(TM, true).process(M);
}

View File

@ -0,0 +1,48 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
; Exercises the late attribute-propagation pass on "amdgpu-flat-work-group-size".
; Every kernel below uses attribute group #1 (work-group size 1,256); the
; expectations record which attribute group each function carries in the output.

; Nothing in this file calls @max_flat_1_1024; it is expected to keep group #0.
; CHECK: define internal void @max_flat_1_1024() #0 {
define internal void @max_flat_1_1024() #0 {
ret void
}
; Called from @kernel_1_256_call_1_256 and already in group #1; expected to stay #1.
; CHECK: define internal void @max_flat_1_256() #1 {
define internal void @max_flat_1_256() #1 {
ret void
}
; Kernel in group #1 calling @default, whose input group (#3) has no
; work-group-size attribute.
; CHECK: define amdgpu_kernel void @kernel_1_256_call_default() #1 {
define amdgpu_kernel void @kernel_1_256_call_default() #1 {
call void @default()
ret void
}
; Kernel in group #1 calling a function that is in the same group.
; CHECK: define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
call void @max_flat_1_256()
ret void
}
; Kernel in group #1 calling a function whose input group (#2) is 64,64.
; CHECK: define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
call void @max_flat_64_64()
ret void
}
; Input group #2 is 64,64; the output group #2 is matched at the end of the
; file as 1,256.
; CHECK: define internal void @max_flat_64_64() #2 {
define internal void @max_flat_64_64() #2 {
ret void
}
; Input group is #3 (noinline only); the expected output group is #2.
; CHECK: define internal void @default() #2 {
define internal void @default() #3 {
ret void
}
; Input attribute groups.
attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024" }
attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256" }
attributes #2 = { noinline "amdgpu-flat-work-group-size"="64,64" }
attributes #3 = { noinline }
; Expected output attribute groups. The patterns below have no closing brace,
; so further text may follow on each matched line.
; CHECK: attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024"
; CHECK-NEXT: attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256"
; CHECK-NEXT: attributes #2 = { noinline "amdgpu-flat-work-group-size"="1,256"