[AMDGPU] Control num waves per EU for implicit work-group size

Summary:
If amdgpu-flat-work-group-size is not specified in LLVM IR, the backend
uses default value of 1024. For this, minimum waves per EU should be 4.
However, backend is still setting minimum value to 1 instead of calculated
value. This is not observed normally as frontend always provide
amdgpu-flat-work-group-size attribute.

Reviewers: rampitec, b-sumner, sameerds, msearles

Reviewed By: rampitec

Subscribers: qcolombet, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D81991
This commit is contained in:
Pushpinder Singh 2020-06-17 00:06:48 -04:00
parent 7996a1ef70
commit e1a31f52cd
6 changed files with 35 additions and 5 deletions

View File

@ -410,10 +410,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// minimum/maximum flat work group sizes.
unsigned MinImpliedByFlatWorkGroupSize =
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
Default.first = MinImpliedByFlatWorkGroupSize;
bool RequestedFlatWorkGroupSize = false;
if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
Default.first = MinImpliedByFlatWorkGroupSize;
RequestedFlatWorkGroupSize = true;
}

View File

@ -4,9 +4,11 @@
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: ;;#ASMEND
define void @foo(i32 addrspace(5)* %ptr) {
define void @foo(i32 addrspace(5)* %ptr) #0 {
%tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
%tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
ret void
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,768" }

View File

@ -1,6 +1,25 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
--- |
define amdgpu_kernel void @a_to_v() #0 { ret void }
define amdgpu_kernel void @a4_to_v4() #0 { ret void }
define amdgpu_kernel void @a16_to_v16() #0 { ret void }
define amdgpu_kernel void @v_to_a() #0 { ret void }
define amdgpu_kernel void @v4_to_a4() #0 { ret void }
define amdgpu_kernel void @v16_to_a16() #0 { ret void }
define amdgpu_kernel void @s_to_a() #0 { ret void }
define amdgpu_kernel void @s2_to_a2() #0 { ret void }
define amdgpu_kernel void @a_to_a() #0 { ret void }
define amdgpu_kernel void @a2_to_a2() #0 { ret void }
define amdgpu_kernel void @a_to_a_spill() #0 { ret void }
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
...
---
name: a_to_v
tracksRegLiveness: true

View File

@ -47,4 +47,4 @@ define void @parent_func() #0 {
ret void
}
attributes #0 = { nounwind noinline norecurse }
attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }

View File

@ -199,7 +199,7 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p)
; GFX908: ScratchSize: 0
; GCN: VGPRBlocks: 63
; GCN: NumVGPRsForWavesPerEU: 256
define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@ -250,7 +250,7 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
; GFX908-FIXME: ScratchSize: 0
; GCN: VGPRBlocks: 63
; GCN: NumVGPRsForWavesPerEU: 256
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@ -288,3 +288,4 @@ st:
declare i32 @llvm.amdgcn.workitem.id.x()
attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

View File

@ -2,6 +2,14 @@
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s
--- |
define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void }
define amdgpu_kernel void @max-counter-vmcnt() #0 { ret void }
define amdgpu_kernel void @max-counter-expcnt() #0 { ret void }
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
...
# Check that we handle cases where a counter has overflowed.
# Overflows lgkmcnt with gfx9 but not with gfx10.