[AMDGPU] Keep skip branch for ds instructions

Like other memory instructions, ds (data share) instructions add latency
even if exec is zero, so jumping over them when exec=0 is cheaper than
executing them.
With this change, the branch instruction that skips over a basic block
if exec=0 is no longer removed when that block contains a ds instruction.

Differential Revision: https://reviews.llvm.org/D97922
This commit is contained in:
Sebastian Neubauer 2021-03-04 10:39:42 +01:00
parent 9b302513f6
commit e0e73714fb
5 changed files with 36 additions and 4 deletions

View File

@ -100,7 +100,7 @@ bool SIRemoveShortExecBranches::mustRetainExeczBranch(
// These instructions are potentially expensive even if EXEC = 0. // These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
I->getOpcode() == AMDGPU::S_WAITCNT) TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
return true; return true;
++NumInstr; ++NumInstr;

View File

@ -14,11 +14,12 @@
; GCN-DAG: v_cmp_lt_f32_e32 vcc, ; GCN-DAG: v_cmp_lt_f32_e32 vcc,
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]] ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
; GCN-NEXT: s_cbranch_execz BB0_{{[0-9]+}}
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %bb4 ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %bb4
; GCN: ds_write_b32 ; GCN: ds_write_b32
; GCN: ; %bb.{{[0-9]+}}: ; GCN: BB0_{{[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm ; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end ; GCN-NEXT: .Lfunc_end
define amdgpu_ps void @ham(float %arg, float %arg1) #0 { define amdgpu_ps void @ham(float %arg, float %arg1) #0 {

View File

@ -56,3 +56,31 @@ body: |
bb.2: bb.2:
S_ENDPGM 0 S_ENDPGM 0
... ...
---
# skip_execz_ds: bb.1 contains a DS_WRITE_B32. The expected output below has an
# S_CBRANCH_EXECZ to bb.2 at the end of bb.0, so bb.1 is jumped over when exec
# is zero instead of being executed.
name: skip_execz_ds
body: |
; CHECK-LABEL: name: skip_execz_ds
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
SI_MASK_BRANCH %bb.2, implicit $exec
bb.1:
successors: %bb.2
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec
bb.2:
S_ENDPGM 0
...

View File

@ -65,6 +65,7 @@ ret.bb: ; preds = %else, %main_body
; GCN: BB{{[0-9]+_[0-9]+}}: ; %else ; GCN: BB{{[0-9]+_[0-9]+}}: ; %else
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_cbranch_execz BB1_{{[0-9]+}}
; GCN-NEXT: ; %unreachable.bb ; GCN-NEXT: ; %unreachable.bb
; GCN: ds_write_b32 ; GCN: ds_write_b32

View File

@ -3,12 +3,13 @@
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator: ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
; GCN: v_cmp_eq_u32 ; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64 ; GCN: s_and_saveexec_b64
; GCN-NEXT: s_cbranch_execz BB0_{{[0-9]+}}
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
; GCN: ds_write_b32 ; GCN: ds_write_b32
; GCN: ; divergent unreachable ; GCN: ; divergent unreachable
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %UnifiedReturnBlock ; GCN-NEXT: BB0_{{[0-9]+}}: ; %UnifiedReturnBlock
; GCN: s_endpgm ; GCN: s_endpgm
define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 { define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
@ -28,12 +29,13 @@ ret:
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order: ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
; GCN: v_cmp_ne_u32 ; GCN: v_cmp_ne_u32
; GCN: s_and_saveexec_b64 ; GCN: s_and_saveexec_b64
; GCN-NEXT: s_cbranch_execz BB1_{{[0-9]+}}
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
; GCN: ds_write_b32 ; GCN: ds_write_b32
; GCN: ; divergent unreachable ; GCN: ; divergent unreachable
; GCN: ; %bb.{{[0-9]+}}: ; GCN: BB1_{{[0-9]+}}:
; GCN-NEXT: s_endpgm ; GCN-NEXT: s_endpgm
define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
bb: bb: