[AMDGPU] Turn on the new waitcnt insertion pass. Adjust tests.

-enable-si-insert-waitcnts=1 is now the default.
Pass -enable-si-insert-waitcnts=0 to use the old pass.

Differential Revision: https://reviews.llvm.org/D33730

llvm-svn: 304551
This commit is contained in:
Mark Searles 2017-06-02 14:19:25 +00:00
parent 2aae0649a1
commit 70359ac60d
22 changed files with 22 additions and 46 deletions

View File

@ -116,7 +116,7 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
static cl::opt<bool> EnableSIInsertWaitcntsPass(
"enable-si-insert-waitcnts",
cl::desc("Use new waitcnt insertion pass"),
cl::init(false));
cl::init(true));
// Option to run late CFG structurizer
static cl::opt<bool> LateCFGStructurize(

View File

@ -34,8 +34,6 @@ end:
; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
; GCN: buffer_store_dword
; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; TODO: This waitcnt can be eliminated
; GCN: {{^}}[[END]]:
; GCN: s_endpgm

View File

@ -19,9 +19,8 @@
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
; GCN: ds_write_b32
; GCN: s_waitcnt
; GCN-NEXT: [[BB5]]
; GCN: [[BB5]]
; GCN: s_or_b64 exec, exec
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end

View File

@ -223,7 +223,6 @@ bb3:
; GCN-NEXT: [[BB2]]: ; %bb2
; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
; GCN: buffer_store_dword [[BB2_K]]
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: s_getpc_b64 vcc
@ -393,7 +392,6 @@ bb3:
; GCN-NEXT: ; BB#2: ; %if_uniform
; GCN: buffer_store_dword
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]

View File

@ -37,22 +37,21 @@
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN: mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN: {{^}}BB{{[0-9]+}}_1: ; %if
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
; GCN: s_waitcnt lgkmcnt(0)
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: s_waitcnt vmcnt(0)
; Spill val register
; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0)
; VMEM: [[ENDIF]]:
; Reload and restore exec mask
; VGPR: s_waitcnt lgkmcnt(0)
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@ -119,7 +118,6 @@ endif:
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execz [[END]]
@ -130,7 +128,6 @@ endif:
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: s_and_b64 vcc, exec, vcc
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_cbranch_vccnz [[LOOP]]
@ -197,7 +194,6 @@ end:
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, [[CMP0]]
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; FIXME: It makes no sense to put this skip here
; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
@ -235,7 +231,6 @@ end:
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execz [[ENDIF]]
@ -245,14 +240,12 @@ end:
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN: [[ELSE]]: ; %else
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_branch [[FLOW]]
; GCN: [[ENDIF]]:

View File

@ -120,8 +120,7 @@ entry:
; FIXME: The waitcnt for the argument load can go after the loop
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
; GCN: s_waitcnt lgkmcnt(0)
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
@ -250,8 +249,6 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_waitcnt lgkmcnt(0)
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
@ -290,7 +287,6 @@ entry:
; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_waitcnt lgkmcnt(0)
; The offset depends on the register that holds the first element of the vector.
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
@ -330,9 +326,9 @@ entry:
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_waitcnt vmcnt(0)
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
@ -411,6 +407,7 @@ bb2:
; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]

View File

@ -4,8 +4,8 @@
; SI-LABEL: {{^}}infinite_loop:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
; SI: BB0_1:
; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG]]
; SI: s_waitcnt vmcnt(0) expcnt(0)
; SI: s_branch BB0_1
define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
entry:

View File

@ -58,7 +58,7 @@ main_body:
;
;CHECK-LABEL: {{^}}buffer_store_wait:
;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0) expcnt(0)
;CHECK: s_waitcnt expcnt(0)
;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen

View File

@ -58,7 +58,7 @@ main_body:
;
;CHECK-LABEL: {{^}}buffer_store_wait:
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0) expcnt(0)
;CHECK: s_waitcnt expcnt(0)
;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen

View File

@ -5,7 +5,6 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
; FUNC-LABEL: {{^}}ds_swizzle:
; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:swizzle(BITMASK_PERM,"00p11")
; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
%swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
store i32 %swizzle, i32 addrspace(1)* %out, align 4

View File

@ -130,7 +130,7 @@ main_body:
;
; GCN-LABEL: {{^}}image_store_wait:
; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN: s_waitcnt expcnt(0)
; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
; GCN: s_waitcnt vmcnt(0)
; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm

View File

@ -20,7 +20,7 @@ define amdgpu_kernel void @test_s_dcache_inv() #0 {
; GCN: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv()
call void @llvm.amdgcn.s.waitcnt(i32 0)
call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:

View File

@ -20,7 +20,7 @@ define amdgpu_kernel void @test_s_dcache_inv_vol() #0 {
; GCN: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv.vol()
call void @llvm.amdgcn.s.waitcnt(i32 0)
call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:

View File

@ -18,7 +18,7 @@ define amdgpu_kernel void @test_s_dcache_wb() #0 {
; VI: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb()
call void @llvm.amdgcn.s.waitcnt(i32 0)
call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:

View File

@ -18,7 +18,7 @@ define amdgpu_kernel void @test_s_dcache_wb_vol() #0 {
; VI: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb.vol()
call void @llvm.amdgcn.s.waitcnt(i32 0)
call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:

View File

@ -18,8 +18,8 @@ define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float>
;
; CHECK-LABEL: {{^}}test2:
; CHECK: image_load
; CHECK-NOT: s_waitcnt vmcnt(0){{$}}
; CHECK: s_waitcnt
; CHECK-NEXT: s_waitcnt
; CHECK: s_waitcnt vmcnt(0){{$}}
; CHECK-NEXT: image_store
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) {
%t = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)

View File

@ -362,6 +362,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ; return
define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {

View File

@ -65,7 +65,6 @@ ret.bb: ; preds = %else, %main_body
; GCN-NEXT: ; %unreachable.bb
; GCN: ds_write_b32
; GCN: s_waitcnt
; GCN: ; divergent unreachable
; GCN: ; %ret.bb
@ -73,6 +72,7 @@ ret.bb: ; preds = %else, %main_body
; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt
; GCN-NEXT: ; return
; GCN-NEXT: .Lfunc_end
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {

View File

@ -9,7 +9,6 @@
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
; GCN: ; divergent unreachable
; GCN: s_waitcnt
; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
@ -38,7 +37,6 @@ ret:
; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
; GCN: ; divergent unreachable
; GCN: s_waitcnt
; GCN: [[RETURN]]:
; GCN-NEXT: s_or_b64 exec, exec
@ -66,7 +64,6 @@ unreachable:
; GCN: [[UNREACHABLE]]:
; GCN: ds_write_b32
; GCN: s_waitcnt
define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
bb:
%tmp63 = icmp eq i32 %arg0, 32

View File

@ -5,7 +5,7 @@
; GCN-FUNC: {{^}}vccz_workaround:
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: s_waitcnt lgkmcnt(0)
; VCCZ-BUG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VCCZ-BUG: s_mov_b64 vcc, vcc
; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]

View File

@ -18,13 +18,11 @@
; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill
; TOVMEM: s_waitcnt vmcnt(0)
; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
; TOSMEM-NOT: [[M0_COPY]]
; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
; TOSMEM: s_waitcnt lgkmcnt(0)
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]

View File

@ -11,7 +11,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI: v_cmp_lt_i32_e32 vcc, 0,
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
@ -72,7 +71,6 @@ end:
; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
; SI-NEXT: s_waitcnt
; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
@ -101,7 +99,6 @@ exit:
; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
; SI-NEXT: s_waitcnt
; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
@ -132,7 +129,6 @@ exit:
; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
; SI: ds_write_b32
; SI: s_waitcnt
; SI-NEXT: {{^}}[[FLOW]]:
; SI-NEXT: s_or_saveexec_b64
@ -140,8 +136,8 @@ exit:
; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
; SI: buffer_store_dword
; SI-NEXT: s_waitcnt
; SI: s_waitcnt
; SI-NEXT: buffer_store_dword
; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_or_b64 exec, exec