forked from OSchip/llvm-project
[AMDGPU] Fix DPP operand order in atomic optimizer
Summary: Ensure that the order of operands in the final WWM step of the DPP atomic optimizer is appropriate for sub instructions.

Change-Id: I631d050e1c00a3b4bc7c11a90437064403c4cf30
Reviewers: sheredom, tpr
Reviewed By: sheredom
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, t-tye, jfb, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58900
llvm-svn: 355394
This commit is contained in:
parent
4511f3fa86
commit
9e3f7d8ad0
|
@ -311,7 +311,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
|
||||||
}
|
}
|
||||||
|
|
||||||
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
|
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
|
||||||
NewV = B.CreateBinOp(Op, NewV, SetInactive);
|
NewV = B.CreateBinOp(Op, SetInactive, NewV);
|
||||||
|
|
||||||
// Read the value from the last lane, which has accumulated the values of
|
// Read the value from the last lane, which has accumulated the values of
|
||||||
// each active lane in the wavefront. This will be our new value with which
|
// each active lane in the wavefront. This will be our new value with which
|
||||||
|
|
|
@ -112,7 +112,7 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
||||||
; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
|
; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
|
||||||
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
|
||||||
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
|
||||||
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
|
||||||
|
@ -120,7 +120,8 @@ entry:
|
||||||
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
|
||||||
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
|
||||||
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_sub v[[value]]
|
; GFX8MORE: buffer_atomic_sub v[[value]]
|
||||||
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
|
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
|
||||||
|
|
|
@ -133,7 +133,9 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
|
||||||
|
; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_sub v[[value]]
|
; GFX8MORE: buffer_atomic_sub v[[value]]
|
||||||
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
|
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
|
||||||
|
|
|
@ -136,7 +136,9 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
|
||||||
|
; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
|
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
|
||||||
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
||||||
|
|
|
@ -104,7 +104,9 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
|
||||||
|
; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_sub v[[value]]
|
; GFX8MORE: buffer_atomic_sub v[[value]]
|
||||||
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
|
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
|
||||||
|
|
|
@ -117,7 +117,9 @@ entry:
|
||||||
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
|
||||||
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
; GFX7LESS-NOT: s_bcnt1_i32_b64
|
||||||
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
|
||||||
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
|
; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
|
||||||
|
; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
|
||||||
|
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
|
||||||
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
|
||||||
; GFX8MORE: buffer_atomic_sub v[[value]]
|
; GFX8MORE: buffer_atomic_sub v[[value]]
|
||||||
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
|
define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
|
||||||
|
|
Loading…
Reference in New Issue