[AMDGPU] Don't check for VMEM hazards on GFX10

The hazard where a VMEM instruction reads an SGPR written by a VALU counts as a
data dependency hazard, which GFX10 resolves in hardware, so no nops are
required on GFX10. Tested with Vulkan CTS on GFX10.1 and GFX10.3.

Differential Revision: https://reviews.llvm.org/D97926
Jay Foad 2021-03-03 16:46:53 +00:00
parent ba18a51c38
commit ed7458398a
11 changed files with 100 additions and 104 deletions

@ -157,12 +157,6 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
return HazardType;
// FIXME: Should flat be considered vmem?
if ((SIInstrInfo::isVMEM(*MI) ||
SIInstrInfo::isFLAT(*MI))
&& checkVMEMHazards(MI) > 0)
return HazardType;
if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
return HazardType;
@ -172,6 +166,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (ST.hasNoDataDepHazard())
return NoHazard;
// FIXME: Should flat be considered vmem?
if ((SIInstrInfo::isVMEM(*MI) ||
SIInstrInfo::isFLAT(*MI))
&& checkVMEMHazards(MI) > 0)
return HazardType;
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
return HazardType;
@ -275,9 +275,6 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
if (ST.hasNSAtoVMEMBug())
WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
@ -286,6 +283,9 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (ST.hasNoDataDepHazard())
return WaitStates;
if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
if (SIInstrInfo::isVALU(*MI))
WaitStates = std::max(WaitStates, checkVALUHazards(MI));
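
Taken together, the hunks above move the VMEM/FLAT check in GCNHazardRecognizer::getHazardType and GCNHazardRecognizer::PreEmitNoopsCommon below the ST.hasNoDataDepHazard() early-out, so subtargets that report no data-dependency hazards (GFX10 and later) never reach checkVMEMHazards. A consolidated sketch of the resulting order of checks in getHazardType, reconstructed from the hunks above (simplified; other checks in the real function are elided):

// Sketch only: the order of checks in getHazardType after this change.
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
  return HazardType;

if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
  return HazardType;

// ... other subtarget-specific checks elided ...

// GFX10.x reports no data-dependency hazards, so it returns here and the
// VMEM check below (and the s_nop padding it implies) is skipped entirely.
if (ST.hasNoDataDepHazard())
  return NoHazard;

// FIXME: Should flat be considered vmem?
if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) &&
    checkVMEMHazards(MI) > 0)
  return HazardType;

if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
  return HazardType;

PreEmitNoopsCommon is reordered the same way: on GFX10 it returns its accumulated WaitStates before the line that folds in checkVMEMHazards(MI), which is what removes the s_nop instructions from the GFX10 test checks below.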

@ -85,9 +85,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GCN-NEXT: v_readfirstlane_b32 s6, v16
; GCN-NEXT: v_readfirstlane_b32 s7, v17
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
; GCN-NEXT: s_nop 2
; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
; GCN-NEXT: s_and_saveexec_b32 s0, s0
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@ -125,9 +124,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GCN-NEXT: v_readfirstlane_b32 s6, v12
; GCN-NEXT: v_readfirstlane_b32 s7, v13
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GCN-NEXT: s_nop 2
; GCN-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
; GCN-NEXT: s_and_saveexec_b32 s0, s0
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@ -155,9 +153,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
; GCN-NEXT: v_readfirstlane_b32 s6, v17
; GCN-NEXT: v_readfirstlane_b32 s7, v18
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
; GCN-NEXT: s_nop 2
; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
; GCN-NEXT: s_and_saveexec_b32 s0, s0
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@ -195,9 +192,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GCN-NEXT: v_readfirstlane_b32 s6, v13
; GCN-NEXT: v_readfirstlane_b32 s7, v14
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GCN-NEXT: s_nop 2
; GCN-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
; GCN-NEXT: s_and_saveexec_b32 s0, s0
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0

@ -131,7 +131,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -162,7 +161,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -518,7 +516,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -569,7 +566,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -754,7 +750,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -805,7 +800,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -990,7 +984,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -1041,7 +1034,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -1186,7 +1178,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -1219,7 +1210,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -2040,7 +2030,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -2091,7 +2080,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -2762,7 +2750,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -2813,7 +2800,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -2998,7 +2984,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -3049,7 +3034,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -3234,7 +3218,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -3285,7 +3268,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -3472,7 +3454,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -3525,7 +3506,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -3891,7 +3871,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -3944,7 +3923,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -4308,7 +4286,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -4359,7 +4336,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@ -4720,7 +4696,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@ -4771,7 +4746,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:

@ -198,7 +198,6 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset,
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_nop 2
; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
@ -238,7 +237,6 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_nop 2
; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
@ -279,7 +277,6 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_nop 2
; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
@ -318,7 +315,6 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_nop 2
; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv

@ -788,17 +788,28 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
; Base pointer is uniform, but also in VGPRs
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: ds_read_b64 v[1:2], v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s0, v1
; GCN-NEXT: v_readfirstlane_b32 s1, v2
; GCN-NEXT: s_nop 4
; GCN-NEXT: global_load_ubyte v0, v0, s[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_read_b64 v[1:2], v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_read_b64 v[1:2], v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
%sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
@ -810,17 +821,28 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
; Base pointer is uniform, but also in VGPRs, with imm offset
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: ds_read_b64 v[1:2], v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s0, v1
; GCN-NEXT: v_readfirstlane_b32 s1, v2
; GCN-NEXT: s_nop 4
; GCN-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_read_b64 v[1:2], v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_read_b64 v[1:2], v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
%sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
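
The updated checks above show the user-visible effect of the reordering: GFX9 still pads the v_readfirstlane writes to s[0:1] with s_nop 4 before the global_load, while GFX10 issues the load immediately after the SGPR writes. As a rough, self-contained model of that padding arithmetic (the five-wait-state requirement and the s_nop semantics are assumptions for illustration only, not values taken from this patch):

#include <algorithm>
#include <cstdio>

// Toy model: assume a VMEM instruction that sources an SGPR written by a
// recent VALU needs 5 wait states after that write, that every intervening
// instruction covers one wait state, and that "s_nop N" covers N + 1 more.
int paddingWaitStates(bool hasNoDataDepHazard, int instrsSinceSgprWrite) {
  if (hasNoDataDepHazard)            // GFX10+: the hardware interlocks instead.
    return 0;
  const int VmemSgprWaitStates = 5;  // assumed value, for illustration only
  return std::max(0, VmemSgprWaitStates - instrsSinceSgprWrite);
}

int main() {
  // GFX9-style target, load issued right after the second v_readfirstlane:
  // 5 wait states of padding, i.e. the "s_nop 4" in the GFX9 checks above.
  int WS = paddingWaitStates(/*hasNoDataDepHazard=*/false, 0);
  if (WS > 0)
    std::printf("gfx9:  s_nop %d\n", WS - 1);
  // GFX10-style target: no padding at all.
  std::printf("gfx10: %d wait states of padding\n",
              paddingWaitStates(/*hasNoDataDepHazard=*/true, 0));
  return 0;
}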

@ -58,16 +58,26 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspa
; Base pointer is uniform, but also in VGPRs
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) {
; GCN-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: ds_read_b64 v[2:3], v2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s0, v2
; GCN-NEXT: v_readfirstlane_b32 s1, v3
; GCN-NEXT: s_nop 4
; GCN-NEXT: global_store_byte v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[2:3], v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: v_readfirstlane_b32 s1, v3
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ds_read_b64 v[2:3], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v2
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
@ -77,16 +87,26 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
; Base pointer is uniform, but also in VGPRs, with imm offset
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) {
; GCN-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: ds_read_b64 v[2:3], v2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s0, v2
; GCN-NEXT: v_readfirstlane_b32 s1, v3
; GCN-NEXT: s_nop 4
; GCN-NEXT: global_store_byte v0, v1, s[0:1] offset:-120
; GCN-NEXT: s_endpgm
; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[2:3], v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: v_readfirstlane_b32 s1, v3
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:-120
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ds_read_b64 v[2:3], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v2
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: global_store_byte v0, v1, s[0:1] offset:-120
; GFX10-NEXT: s_endpgm
%sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset

@ -40,7 +40,7 @@ body: |
# GCN-LABEL: name: vmem_vcc_hazard_ignore_bundle_instr
# GCN: S_LOAD_DWORDX2_IMM
# GCN-NEXT: }
# GCN-NEXT: S_NOP 3
# GFX9-NEXT: S_NOP 3
# GCN: BUFFER_LOAD_DWORD_OFFEN
---
name: vmem_vcc_hazard_ignore_bundle_instr
@ -60,7 +60,7 @@ body: |
# GCN-LABEL: name: vmem_vcc_min_of_two_after_bundle
# GCN: bb.2:
# GCN-NEXT: S_NOP 4
# GFX9-NEXT: S_NOP 4
# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
---
name: vmem_vcc_min_of_two_after_bundle

@ -67,7 +67,7 @@ body: |
# GCN-LABEL: name: vmem_vcc_hazard_in_bundle
# GCN: S_LOAD_DWORDX2_IMM
# GCN-NEXT: S_NOP 3
# GFX9-NEXT: S_NOP 3
# GCN: BUFFER_LOAD_DWORD_OFFEN
---
name: vmem_vcc_hazard_in_bundle

@ -16,7 +16,6 @@
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
; W64: s_nop 0
; W64: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[AND]]
; W64: s_cbranch_execnz [[LOOPBB]]
@ -34,7 +33,6 @@
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
; W32: s_nop 0
; W32: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
; W32: s_cbranch_execnz [[LOOPBB]]
@ -59,7 +57,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
; W64: s_nop 0
; W64: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[SAVE]]
; W64: s_cbranch_execnz [[LOOPBB0]]
@ -77,7 +74,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
; W64: s_nop 0
; W64: buffer_load_format_x [[RES1:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[SAVE]]
; W64: s_cbranch_execnz [[LOOPBB1]]
@ -99,7 +95,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
; W32: s_nop 0
; W32: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
; W32: s_cbranch_execnz [[LOOPBB0]]
@ -117,7 +112,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
; W32: s_nop 0
; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
; W32: s_cbranch_execnz [[LOOPBB1]]
@ -150,7 +144,6 @@ entry:
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
; W64: s_nop 0
; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[SAVE]]
; W64: s_cbranch_execnz [[LOOPBB0]]
@ -171,7 +164,6 @@ entry:
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
; W64: s_nop 0
; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[SAVE]]
; W64: s_cbranch_execnz [[LOOPBB1]]
@ -196,7 +188,6 @@ entry:
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
; W32: s_nop 0
; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
; W32: s_cbranch_execnz [[LOOPBB0]]
@ -217,7 +208,6 @@ entry:
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
; W32: s_nop 0
; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
; W32: s_cbranch_execnz [[LOOPBB1]]

@ -67,7 +67,6 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
; FLATSCR-NEXT: s_cbranch_execz BB0_2
; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i
; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000
; FLATSCR-NEXT: s_nop 1
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, vcc_lo offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0

@ -26,7 +26,6 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GCN-NEXT: s_and_b32 s4, vcc_lo, s4
; GCN-NEXT: s_and_saveexec_b32 s4, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4