llvm-project/llvm/test/CodeGen/AMDGPU/large-work-group-promote-al...

; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s

; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4

; Test input for the promote-alloca pass with only a work-group size limit.
; The body writes a private [5 x i32] array at two indices loaded from %in and
; then copies elements 0 and 1 of that array to %out.
; Attribute group #0 sets "amdgpu-max-work-group-size"="63"; the FileCheck
; directive placed before this function expects the output module to contain
; an addrspace(3) global @promote_alloca_size_63.stack of type
; [63 x [5 x i32]].
define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  ; NOTE(review): the expected global is spelled "<function>.stack", presumably
  ; derived from this value's name -- do not rename %stack without updating the
  ; expectation.
  %stack = alloca [5 x i32], align 4
  ; stack[in[0]] = 4 -- the index is loaded from memory, not a constant.
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  ; stack[in[1]] = 5
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; out[0] = stack[0]
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  ; out[1] = stack[1] -- this load/store pair carries no explicit alignment,
  ; unlike the accesses above.
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4

; Test input for the promote-alloca pass with both a waves-per-EU limit and a
; work-group size limit. The body is the same array write/read pattern used by
; the other functions in this file.
; Attribute group #1 sets "amdgpu-max-waves-per-eu"="3" and
; "amdgpu-max-work-group-size"="256"; the FileCheck directive placed before
; this function expects an addrspace(3) global @promote_alloca_size_256.stack
; of type [256 x [5 x i32]].
define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
  ; NOTE(review): the expected global is spelled "<function>.stack", presumably
  ; derived from this value's name -- do not rename %stack without updating the
  ; expectation.
  %stack = alloca [5 x i32], align 4
  ; stack[in[0]] = 4 -- the index is loaded from memory, not a constant.
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  ; stack[in[1]] = 5
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; out[0] = stack[0]
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  ; out[1] = stack[1] -- this load/store pair carries no explicit alignment,
  ; unlike the accesses above.
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4

; Test input for the promote-alloca pass with a large work-group size and a
; waves-per-EU limit of 1. The body is the same array write/read pattern used
; by the other functions in this file.
; Attribute group #2 sets "amdgpu-max-waves-per-eu"="1" and
; "amdgpu-max-work-group-size"="1600"; the FileCheck directive placed before
; this function expects an addrspace(3) global @promote_alloca_size_1600.stack
; of type [1600 x [5 x i32]].
define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
  ; NOTE(review): the expected global is spelled "<function>.stack", presumably
  ; derived from this value's name -- do not rename %stack without updating the
  ; expectation.
  %stack = alloca [5 x i32], align 4
  ; stack[in[0]] = 4 -- the index is loaded from memory, not a constant.
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  ; stack[in[1]] = 5
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; out[0] = stack[0]
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  ; out[1] = stack[1] -- this load/store pair carries no explicit alignment,
  ; unlike the accesses above.
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; CHECK: @occupancy_0(
; CHECK: alloca [5 x i32]
; Test input where the alloca is expected to survive the pass: the two
; FileCheck directives placed before this function require the function header
; followed by an "alloca [5 x i32]" in the output.
; Attribute group #3 sets "amdgpu-max-waves-per-eu"="0" and gives no work-group
; size. The body is the same array write/read pattern used by the other
; functions in this file.
define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
  ; This instruction is what the expectation above looks for in the output.
  %stack = alloca [5 x i32], align 4
  ; stack[in[0]] = 4 -- the index is loaded from memory, not a constant.
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  ; stack[in[1]] = 5
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; out[0] = stack[0]
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  ; out[1] = stack[1] -- this load/store pair carries no explicit alignment,
  ; unlike the accesses above.
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; CHECK: @occupancy_max(
; CHECK: alloca [5 x i32]
; Test input where the alloca is expected to survive the pass: the two
; FileCheck directives placed before this function require the function header
; followed by an "alloca [5 x i32]" in the output.
; Attribute group #4 sets "amdgpu-max-waves-per-eu"="-1" and gives no
; work-group size. The body is the same array write/read pattern used by the
; other functions in this file.
define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
  ; This instruction is what the expectation above looks for in the output.
  %stack = alloca [5 x i32], align 4
  ; stack[in[0]] = 4 -- the index is loaded from memory, not a constant.
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  ; stack[in[1]] = 5
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; out[0] = stack[0]
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  ; out[1] = stack[1] -- this load/store pair carries no explicit alignment,
  ; unlike the accesses above.
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Attribute groups referenced by the function definitions in this file.
; The string attributes are the only thing that differs between the otherwise
; identical function bodies.

; #0 (@promote_alloca_size_63): work-group size limit only.
attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
; #1 (@promote_alloca_size_256): waves-per-EU limit plus work-group size limit.
attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="3" "amdgpu-max-work-group-size"="256" }
; #2 (@promote_alloca_size_1600): waves-per-EU limit of 1 plus a work-group
; size limit of 1600.
attributes #2 = { nounwind "amdgpu-max-waves-per-eu"="1" "amdgpu-max-work-group-size"="1600" }
; #3 (@occupancy_0): waves-per-EU given as 0, no work-group size.
; NOTE(review): presumably exercises an out-of-range/boundary value -- confirm
; against the pass's handling of this attribute.
attributes #3 = { nounwind "amdgpu-max-waves-per-eu"="0" }
; #4 (@occupancy_max): waves-per-EU given as -1, no work-group size.
; NOTE(review): presumably parsed as a very large unsigned value (hence the
; function name) -- confirm against the pass's attribute parsing.
attributes #4 = { nounwind "amdgpu-max-waves-per-eu"="-1" }
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit Summary: For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD. This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions. Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug. Reviewers: mareko, arsenm, tstellarAMD, nhaehnle Subscribers: FireBurn, kerberizer, llvm-commits, arsenm Differential Revision: http://reviews.llvm.org/D18340 Patch By: Bas Nieuwenhuizen llvm-svn: 266337 2016-04-15 00:27:07 +08:00			`; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s \| FileCheck %s`

			`; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4`

			`define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {`
			`entry:`
			`%stack = alloca [5 x i32], align 4`
			`%0 = load i32, i32 addrspace(1)* %in, align 4`
			`%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0`
			`store i32 4, i32* %arrayidx1, align 4`
			`%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1`
			`%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4`
			`%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1`
			`store i32 5, i32* %arrayidx3, align 4`
			`%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0`
			`%2 = load i32, i32* %arrayidx10, align 4`
			`store i32 %2, i32 addrspace(1)* %out, align 4`
			`%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1`
			`%3 = load i32, i32* %arrayidx12`
			`%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1`
			`store i32 %3, i32 addrspace(1)* %arrayidx13`
			`ret void`
			`}`

			`; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4`

			`define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {`
			`entry:`
			`%stack = alloca [5 x i32], align 4`
			`%0 = load i32, i32 addrspace(1)* %in, align 4`
			`%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0`
			`store i32 4, i32* %arrayidx1, align 4`
			`%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1`
			`%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4`
			`%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1`
			`store i32 5, i32* %arrayidx3, align 4`
			`%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0`
			`%2 = load i32, i32* %arrayidx10, align 4`
			`store i32 %2, i32 addrspace(1)* %out, align 4`
			`%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1`
			`%3 = load i32, i32* %arrayidx12`
			`%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1`
			`store i32 %3, i32 addrspace(1)* %arrayidx13`
			`ret void`
			`}`

			`; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4`

			`define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {`
			`entry:`
			`%stack = alloca [5 x i32], align 4`
			`%0 = load i32, i32 addrspace(1)* %in, align 4`
			`%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0`
			`store i32 4, i32* %arrayidx1, align 4`
			`%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1`
			`%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4`
			`%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1`
			`store i32 5, i32* %arrayidx3, align 4`
			`%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0`
			`%2 = load i32, i32* %arrayidx10, align 4`
			`store i32 %2, i32 addrspace(1)* %out, align 4`
			`%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1`
			`%3 = load i32, i32* %arrayidx12`
			`%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1`
			`store i32 %3, i32 addrspace(1)* %arrayidx13`
			`ret void`
			`}`

AMDGPU: Fix promote alloca pass creating huge arrays This was assuming it could use all memory before, which is a bad decision because it restricts occupancy. By default, only try to use enough space that could reduce occupancy to 7, an arbitrarily chosen limit. Based on the exist LDS usage, try to round up to the limit in the current tier instead of further hurting occupancy. This isn't ideal, because it doesn't accurately know how much space is going to be used for alignment padding. llvm-svn: 269708 2016-05-17 05:19:59 +08:00			`; CHECK: @occupancy_0(`
			`; CHECK: alloca [5 x i32]`
			`define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {`
			`entry:`
			`%stack = alloca [5 x i32], align 4`
			`%0 = load i32, i32 addrspace(1)* %in, align 4`
			`%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0`
			`store i32 4, i32* %arrayidx1, align 4`
			`%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1`
			`%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4`
			`%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1`
			`store i32 5, i32* %arrayidx3, align 4`
			`%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0`
			`%2 = load i32, i32* %arrayidx10, align 4`
			`store i32 %2, i32 addrspace(1)* %out, align 4`
			`%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1`
			`%3 = load i32, i32* %arrayidx12`
			`%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1`
			`store i32 %3, i32 addrspace(1)* %arrayidx13`
			`ret void`
			`}`
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit Summary: For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD. This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions. Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug. Reviewers: mareko, arsenm, tstellarAMD, nhaehnle Subscribers: FireBurn, kerberizer, llvm-commits, arsenm Differential Revision: http://reviews.llvm.org/D18340 Patch By: Bas Nieuwenhuizen llvm-svn: 266337 2016-04-15 00:27:07 +08:00
AMDGPU: Fix promote alloca pass creating huge arrays This was assuming it could use all memory before, which is a bad decision because it restricts occupancy. By default, only try to use enough space that could reduce occupancy to 7, an arbitrarily chosen limit. Based on the exist LDS usage, try to round up to the limit in the current tier instead of further hurting occupancy. This isn't ideal, because it doesn't accurately know how much space is going to be used for alignment padding. llvm-svn: 269708 2016-05-17 05:19:59 +08:00			`; CHECK: @occupancy_max(`
			`; CHECK: alloca [5 x i32]`
			`define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {`
			`entry:`
			`%stack = alloca [5 x i32], align 4`
			`%0 = load i32, i32 addrspace(1)* %in, align 4`
			`%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0`
			`store i32 4, i32* %arrayidx1, align 4`
			`%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1`
			`%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4`
			`%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1`
			`store i32 5, i32* %arrayidx3, align 4`
			`%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0`
			`%2 = load i32, i32* %arrayidx10, align 4`
			`store i32 %2, i32 addrspace(1)* %out, align 4`
			`%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1`
			`%3 = load i32, i32* %arrayidx12`
			`%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1`
			`store i32 %3, i32 addrspace(1)* %arrayidx13`
			`ret void`
			`}`

			`attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }`
			`attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="3" "amdgpu-max-work-group-size"="256" }`
			`attributes #2 = { nounwind "amdgpu-max-waves-per-eu"="1" "amdgpu-max-work-group-size"="1600" }`
			`attributes #3 = { nounwind "amdgpu-max-waves-per-eu"="0" }`
			`attributes #4 = { nounwind "amdgpu-max-waves-per-eu"="-1" }`