forked from OSchip/llvm-project
[AMDGPU] Control num waves per EU for implicit work-group size
Summary: If amdgpu-flat-work-group-size is not specified in the LLVM IR, the backend uses a default value of 1024. For this work-group size, the minimum number of waves per EU should be 4. However, the backend is still setting the minimum value to 1 instead of the calculated value. This is not normally observed, because the frontend always provides the amdgpu-flat-work-group-size attribute. Reviewers: rampitec, b-sumner, sameerds, msearles Reviewed By: rampitec Subscribers: qcolombet, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D81991
This commit is contained in:
parent
7996a1ef70
commit
e1a31f52cd
|
@ -410,10 +410,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
|
||||||
// minimum/maximum flat work group sizes.
|
// minimum/maximum flat work group sizes.
|
||||||
unsigned MinImpliedByFlatWorkGroupSize =
|
unsigned MinImpliedByFlatWorkGroupSize =
|
||||||
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
|
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
|
||||||
|
Default.first = MinImpliedByFlatWorkGroupSize;
|
||||||
bool RequestedFlatWorkGroupSize = false;
|
bool RequestedFlatWorkGroupSize = false;
|
||||||
|
|
||||||
if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
|
if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
|
||||||
Default.first = MinImpliedByFlatWorkGroupSize;
|
|
||||||
RequestedFlatWorkGroupSize = true;
|
RequestedFlatWorkGroupSize = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,9 +4,11 @@
|
||||||
; CHECK-NEXT: s_nop 0
|
; CHECK-NEXT: s_nop 0
|
||||||
; CHECK-NEXT: ;;#ASMEND
|
; CHECK-NEXT: ;;#ASMEND
|
||||||
|
|
||||||
define void @foo(i32 addrspace(5)* %ptr) {
|
define void @foo(i32 addrspace(5)* %ptr) #0 {
|
||||||
%tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
|
%tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
|
||||||
%tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
|
%tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
|
||||||
store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
|
store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
attributes #0 = { "amdgpu-flat-work-group-size"="1,768" }
|
||||||
|
|
|
@ -1,6 +1,25 @@
|
||||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||||
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
|
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
|
||||||
|
|
||||||
|
--- |
|
||||||
|
define amdgpu_kernel void @a_to_v() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @a4_to_v4() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @a16_to_v16() #0 { ret void }
|
||||||
|
|
||||||
|
define amdgpu_kernel void @v_to_a() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @v4_to_a4() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @v16_to_a16() #0 { ret void }
|
||||||
|
|
||||||
|
define amdgpu_kernel void @s_to_a() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @s2_to_a2() #0 { ret void }
|
||||||
|
|
||||||
|
define amdgpu_kernel void @a_to_a() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @a2_to_a2() #0 { ret void }
|
||||||
|
|
||||||
|
define amdgpu_kernel void @a_to_a_spill() #0 { ret void }
|
||||||
|
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
|
||||||
|
...
|
||||||
|
|
||||||
---
|
---
|
||||||
name: a_to_v
|
name: a_to_v
|
||||||
tracksRegLiveness: true
|
tracksRegLiveness: true
|
||||||
|
|
|
@ -47,4 +47,4 @@ define void @parent_func() #0 {
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
attributes #0 = { nounwind noinline norecurse }
|
attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }
|
||||||
|
|
|
@ -199,7 +199,7 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p)
|
||||||
; GFX908: ScratchSize: 0
|
; GFX908: ScratchSize: 0
|
||||||
; GCN: VGPRBlocks: 63
|
; GCN: VGPRBlocks: 63
|
||||||
; GCN: NumVGPRsForWavesPerEU: 256
|
; GCN: NumVGPRsForWavesPerEU: 256
|
||||||
define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
|
define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
|
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
|
||||||
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
|
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
|
||||||
|
@ -250,7 +250,7 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
|
||||||
; GFX908-FIXME: ScratchSize: 0
|
; GFX908-FIXME: ScratchSize: 0
|
||||||
; GCN: VGPRBlocks: 63
|
; GCN: VGPRBlocks: 63
|
||||||
; GCN: NumVGPRsForWavesPerEU: 256
|
; GCN: NumVGPRsForWavesPerEU: 256
|
||||||
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
|
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
|
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
|
||||||
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
|
%p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
|
||||||
|
@ -288,3 +288,4 @@ st:
|
||||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
declare i32 @llvm.amdgcn.workitem.id.x()
|
||||||
|
|
||||||
attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
|
attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
|
||||||
|
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
|
||||||
|
|
|
@ -2,6 +2,14 @@
|
||||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s
|
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s
|
||||||
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s
|
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s
|
||||||
|
|
||||||
|
--- |
|
||||||
|
define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @max-counter-vmcnt() #0 { ret void }
|
||||||
|
define amdgpu_kernel void @max-counter-expcnt() #0 { ret void }
|
||||||
|
|
||||||
|
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
|
||||||
|
...
|
||||||
|
|
||||||
# Check that we handle cases where a counter has overflowed.
|
# Check that we handle cases where a counter has overflowed.
|
||||||
|
|
||||||
# Overflows lgkmcnt with gfx9 but not with gfx10.
|
# Overflows lgkmcnt with gfx9 but not with gfx10.
|
||||||
|
|
Loading…
Reference in New Issue