diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 0482865bd518..f5db222fc05b 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -157,12 +157,6 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) return HazardType; - // FIXME: Should flat be considered vmem? - if ((SIInstrInfo::isVMEM(*MI) || - SIInstrInfo::isFLAT(*MI)) - && checkVMEMHazards(MI) > 0) - return HazardType; - if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) return HazardType; @@ -172,6 +166,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (ST.hasNoDataDepHazard()) return NoHazard; + // FIXME: Should flat be considered vmem? + if ((SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI)) + && checkVMEMHazards(MI) > 0) + return HazardType; + if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) return HazardType; @@ -275,9 +275,6 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { if (SIInstrInfo::isSMRD(*MI)) return std::max(WaitStates, checkSMRDHazards(MI)); - if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) - WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); - if (ST.hasNSAtoVMEMBug()) WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); @@ -286,6 +283,9 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { if (ST.hasNoDataDepHazard()) return WaitStates; + if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) + WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); + if (SIInstrInfo::isVALU(*MI)) WaitStates = std::max(WaitStates, checkVALUHazards(MI)); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index f363b4ba0f86..bc95c364f891 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -85,9 +85,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GCN-NEXT: v_readfirstlane_b32 s6, v16 ; GCN-NEXT: v_readfirstlane_b32 s7, v17 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] -; GCN-NEXT: s_nop 2 ; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7] +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] ; GCN-NEXT: s_and_b32 s0, s0, vcc_lo ; GCN-NEXT: s_and_saveexec_b32 s0, s0 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -125,9 +124,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GCN-NEXT: v_readfirstlane_b32 s6, v12 ; GCN-NEXT: v_readfirstlane_b32 s7, v13 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] -; GCN-NEXT: s_nop 2 ; GCN-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16 +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GCN-NEXT: s_and_b32 s0, s0, vcc_lo ; GCN-NEXT: s_and_saveexec_b32 s0, s0 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -155,9 +153,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GCN-NEXT: v_readfirstlane_b32 s6, v17 ; GCN-NEXT: v_readfirstlane_b32 s7, v18 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] -; GCN-NEXT: s_nop 2 ; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7] +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] ; GCN-NEXT: s_and_b32 s0, s0, vcc_lo ; GCN-NEXT: s_and_saveexec_b32 s0, s0 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -195,9 +192,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GCN-NEXT: v_readfirstlane_b32 s6, v13 ; GCN-NEXT: v_readfirstlane_b32 s7, v14 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] -; GCN-NEXT: s_nop 2 ; GCN-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16 +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GCN-NEXT: s_and_b32 s0, s0, vcc_lo ; GCN-NEXT: s_and_saveexec_b32 s0, s0 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 0dd84a9c5eb8..1753cdcf407b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -131,7 +131,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -162,7 +161,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -518,7 +516,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -569,7 +566,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -754,7 +750,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -805,7 +800,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -990,7 +984,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1041,7 +1034,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1186,7 +1178,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1219,7 +1210,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2040,7 +2030,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2091,7 +2080,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2762,7 +2750,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2813,7 +2800,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2998,7 +2984,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3049,7 +3034,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -3234,7 +3218,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3285,7 +3268,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -3472,7 +3454,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3525,7 +3506,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -3891,7 +3871,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3944,7 +3923,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -4308,7 +4286,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -4359,7 +4336,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -4720,7 +4696,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -4771,7 +4746,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll index b511d5917945..af3a6a868981 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -198,7 +198,6 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -238,7 +237,6 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -279,7 +277,6 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -318,7 +315,6 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index f28b9fdcdccc..1e58d6dbba05 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -788,17 +788,28 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_ ; Base pointer is uniform, but also in VGPRs define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { -; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs: -; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: ds_read_b64 v[1:2], v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v1 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_load_ubyte v0, v0, s[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_read_b64 v[1:2], v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v2 +; GFX9-NEXT: s_nop 4 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ds_read_b64 v[1:2], v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -810,17 +821,28 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; Base pointer is uniform, but also in VGPRs, with imm offset define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { -; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: -; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: ds_read_b64 v[1:2], v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v1 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_read_b64 v[1:2], v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v2 +; GFX9-NEXT: s_nop 4 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ds_read_b64 v[1:2], v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll index c50efde84520..2b3e6a4e9c31 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -58,16 +58,26 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspa ; Base pointer is uniform, but also in VGPRs define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) { -; GCN-LABEL: global_store_saddr_uniform_ptr_in_vgprs: -; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: ds_read_b64 v[2:3], v2 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: v_readfirstlane_b32 s1, v3 -; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_byte v0, v1, s[0:1] -; GCN-NEXT: s_endpgm +; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_read_b64 v[2:3], v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9-NEXT: s_nop 4 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: ds_read_b64 v[2:3], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -77,16 +87,26 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 ; Base pointer is uniform, but also in VGPRs, with imm offset define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) { -; GCN-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: -; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: ds_read_b64 v[2:3], v2 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: v_readfirstlane_b32 s1, v3 -; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_byte v0, v1, s[0:1] offset:-120 -; GCN-NEXT: s_endpgm +; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_read_b64 v[2:3], v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9-NEXT: s_nop 4 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:-120 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: ds_read_b64 v[2:3], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] offset:-120 +; GFX10-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir index e36f4bd63387..496416fd9864 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -40,7 +40,7 @@ body: | # GCN-LABEL: name: vmem_vcc_hazard_ignore_bundle_instr # GCN: S_LOAD_DWORDX2_IMM # GCN-NEXT: } -# GCN-NEXT: S_NOP 3 +# GFX9-NEXT: S_NOP 3 # GCN: BUFFER_LOAD_DWORD_OFFEN --- name: vmem_vcc_hazard_ignore_bundle_instr @@ -60,7 +60,7 @@ body: | # GCN-LABEL: name: vmem_vcc_min_of_two_after_bundle # GCN: bb.2: -# GCN-NEXT: S_NOP 4 +# GFX9-NEXT: S_NOP 4 # GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN --- name: vmem_vcc_min_of_two_after_bundle diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir index 48f26c1e0b9b..7f99b4af154a 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -67,7 +67,7 @@ body: | # GCN-LABEL: name: vmem_vcc_hazard_in_bundle # GCN: S_LOAD_DWORDX2_IMM -# GCN-NEXT: S_NOP 3 +# GFX9-NEXT: S_NOP 3 # GCN: BUFFER_LOAD_DWORD_OFFEN --- name: vmem_vcc_hazard_in_bundle diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 4dfc9bce69aa..787877916783 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -16,7 +16,6 @@ ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: s_nop 0 ; W64: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[AND]] ; W64: s_cbranch_execnz [[LOOPBB]] @@ -34,7 +33,6 @@ ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: s_nop 0 ; W32: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB]] @@ -59,7 +57,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: s_nop 0 ; W64: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB0]] @@ -77,7 +74,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: s_nop 0 ; W64: buffer_load_format_x [[RES1:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB1]] @@ -99,7 +95,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: s_nop 0 ; W32: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB0]] @@ -117,7 +112,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: s_nop 0 ; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB1]] @@ -150,7 +144,6 @@ entry: ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: s_nop 0 ; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB0]] @@ -171,7 +164,6 @@ entry: ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: s_nop 0 ; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB1]] @@ -196,7 +188,6 @@ entry: ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: s_nop 0 ; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB0]] @@ -217,7 +208,6 @@ entry: ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: s_nop 0 ; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB1]] diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index c743281440c5..c48b3e185e19 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -67,7 +67,6 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; FLATSCR-NEXT: s_cbranch_execz BB0_2 ; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i ; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 -; FLATSCR-NEXT: s_nop 1 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, vcc_lo offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index 3a18f4680466..ede567e58b69 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -26,7 +26,6 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 { ; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 -; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4