; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)

; Show what the atomic optimization pass will do for raw buffers.

; GCN-LABEL: add_i32_constant:
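; A constant value is uniform across the wavefront. The pass computes each
; lane's rank with v_mbcnt so that only the first active lane performs the
; atomic, popcounts the exec mask with s_bcnt1_i32_b64, and multiplies the
; active-lane count by the constant, so one atomic add of popcount*5 replaces
; the per-lane adds of 5.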
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
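; Same as above, but the value is only known to be uniform at run time, so the
; popcount is multiplied by the SGPR operand with s_mul_i32 instead of by an
; inline constant.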
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_varying_vdata:
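; A divergent value requires a wavefront reduction. GFX7 and earlier have no
; DPP, so the atomic is left alone (no mbcnt/bcnt sequence). GFX8 and later
; reduce with a chain of DPP adds and v_readlane the total from lane 63.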
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_varying_offset:
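; The offset is divergent, so each lane may touch a different location and the
; atomics cannot be combined; no lane-counting code should appear.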
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_add v{{[0-9]+}}
define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_constant:
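; The constant-value rewrite applies to subtraction as well: one
; buffer_atomic_sub of popcount*5 replaces the per-lane subtractions.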
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
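; Likewise for a run-time uniform value: s_mul_i32 scales the SGPR operand by
; the popcount before the single buffer_atomic_sub.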
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
entry:
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying_vdata:
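; The divergent case mirrors add_i32_varying_vdata: no rewrite for GFX7 and
; earlier, a DPP reduction (v_sub or v_subrev) plus a lane-63 readlane on
; GFX8 and later.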
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying_offset:
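; As with add_i32_varying_offset, a divergent offset defeats the optimization
; and the atomic is emitted unchanged.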
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: buffer_atomic_sub v{{[0-9]+}}
define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
  store i32 %old, i32 addrspace(1)* %out
  ret void
}