|
|
|
; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
|
|
|
|
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
|
|
|
|
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
|
|
|
|
|
|
|
|
declare i1 @llvm.amdgcn.wqm.vote(i1)
|
|
|
|
declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
|
|
|
|
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)
|
|
|
|
|
|
|
|
; Show what the atomic optimization pass will do for raw buffers.
|
|
|
|
|
|
|
|
; GCN-LABEL: add_i32_constant:
|
|
|
|
; GCN-LABEL: BB0_1:
|
|
|
|
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
|
|
|
|
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
|
|
|
|
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
|
|
|
|
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
|
|
|
|
; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
|
|
|
|
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
|
|
|
|
; GCN: buffer_atomic_add v[[value]]
|
|
|
|
; GCN: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
|
|
|
|
; Uniform-operand case: every active lane adds the same immediate (5), so the
; optimizer can fold the per-lane adds into popcount(exec) * 5, issue a single
; buffer atomic from one lane, and broadcast the result with readfirstlane.
define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
entry:
  ; WQM votes on either side of the atomic keep helper lanes coherent around
  ; the control flow introduced by the atomic optimizer.
  %vote.pre = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
  %vote.post = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %vote.both = and i1 %vote.pre, %vote.post
  br i1 %vote.both, label %if, label %else

if:
  ; Store the pre-add value returned by the atomic.
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)
  ret void

else:
  ret void
}
|
|
|
|
|
|
|
|
; GCN-LABEL: add_i32_varying:
|
|
|
|
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
|
|
|
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
|
|
|
; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
|
|
|
|
; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
|
|
|
|
; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
|
|
|
|
; GFX8MORE: v_add_u32_dpp
|
|
|
|
; GFX8MORE: v_add_u32_dpp
|
|
|
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
|
|
|
; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
|
|
|
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
|
|
|
; GFX8MORE: buffer_atomic_add v[[value]]
|
|
|
|
; GFX8MORE: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
|
|
|
|
; Divergent-operand case: each lane contributes its own %val, so on GFX8+ the
; optimizer must build a wavefront prefix sum with DPP adds, read the total
; from lane 63, and have a single lane issue the buffer atomic. GFX7 and older
; lack DPP, so the transform is skipped there (no mbcnt sequence emitted).
define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
entry:
  ; WQM votes bracket the atomic so helper lanes stay coherent across the
  ; control flow the optimizer inserts.
  %vote.pre = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i1 0)
  %vote.post = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %vote.both = and i1 %vote.pre, %vote.post
  br i1 %vote.both, label %if, label %else

if:
  ; Store the pre-add value returned by the atomic.
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)
  ret void

else:
  ret void
}
|