forked from OSchip/llvm-project
[AMDGPU] Keep skip branch for ds instructions
Like other memory instructions, DS instructions add latency even when EXEC is zero, so jumping over them when EXEC = 0 is cheaper than executing them. With this change, the s_cbranch_execz instruction that skips over a basic block when EXEC = 0 is no longer removed if the block contains a DS instruction. Differential Revision: https://reviews.llvm.org/D97922
This commit is contained in:
parent
9b302513f6
commit
e0e73714fb
|
@ -100,7 +100,7 @@ bool SIRemoveShortExecBranches::mustRetainExeczBranch(
|
||||||
|
|
||||||
// These instructions are potentially expensive even if EXEC = 0.
|
// These instructions are potentially expensive even if EXEC = 0.
|
||||||
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
|
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
|
||||||
I->getOpcode() == AMDGPU::S_WAITCNT)
|
TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
++NumInstr;
|
++NumInstr;
|
||||||
|
|
|
@ -14,11 +14,12 @@
|
||||||
; GCN-DAG: v_cmp_lt_f32_e32 vcc,
|
; GCN-DAG: v_cmp_lt_f32_e32 vcc,
|
||||||
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
|
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
|
||||||
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
|
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
|
||||||
|
; GCN-NEXT: s_cbranch_execz BB0_{{[0-9]+}}
|
||||||
|
|
||||||
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %bb4
|
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %bb4
|
||||||
; GCN: ds_write_b32
|
; GCN: ds_write_b32
|
||||||
|
|
||||||
; GCN: ; %bb.{{[0-9]+}}:
|
; GCN: BB0_{{[0-9]+}}: ; %UnifiedReturnBlock
|
||||||
; GCN-NEXT: s_endpgm
|
; GCN-NEXT: s_endpgm
|
||||||
; GCN-NEXT: .Lfunc_end
|
; GCN-NEXT: .Lfunc_end
|
||||||
define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
|
define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
|
||||||
|
|
|
@ -56,3 +56,31 @@ body: |
|
||||||
bb.2:
|
bb.2:
|
||||||
S_ENDPGM 0
|
S_ENDPGM 0
|
||||||
...
|
...
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
name: skip_execz_ds
|
||||||
|
body: |
|
||||||
|
; CHECK-LABEL: name: skip_execz_ds
|
||||||
|
; CHECK: bb.0:
|
||||||
|
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||||
|
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
|
||||||
|
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||||
|
; CHECK: bb.1:
|
||||||
|
; CHECK: successors: %bb.2(0x80000000)
|
||||||
|
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
; CHECK: DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec
|
||||||
|
; CHECK: bb.2:
|
||||||
|
; CHECK: S_ENDPGM 0
|
||||||
|
bb.0:
|
||||||
|
successors: %bb.1, %bb.2
|
||||||
|
SI_MASK_BRANCH %bb.2, implicit $exec
|
||||||
|
|
||||||
|
bb.1:
|
||||||
|
successors: %bb.2
|
||||||
|
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec
|
||||||
|
|
||||||
|
bb.2:
|
||||||
|
S_ENDPGM 0
|
||||||
|
...
|
|
@ -65,6 +65,7 @@ ret.bb: ; preds = %else, %main_body
|
||||||
|
|
||||||
; GCN: BB{{[0-9]+_[0-9]+}}: ; %else
|
; GCN: BB{{[0-9]+_[0-9]+}}: ; %else
|
||||||
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
|
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
|
||||||
|
; GCN-NEXT: s_cbranch_execz BB1_{{[0-9]+}}
|
||||||
|
|
||||||
; GCN-NEXT: ; %unreachable.bb
|
; GCN-NEXT: ; %unreachable.bb
|
||||||
; GCN: ds_write_b32
|
; GCN: ds_write_b32
|
||||||
|
|
|
@ -3,12 +3,13 @@
|
||||||
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
|
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
|
||||||
; GCN: v_cmp_eq_u32
|
; GCN: v_cmp_eq_u32
|
||||||
; GCN: s_and_saveexec_b64
|
; GCN: s_and_saveexec_b64
|
||||||
|
; GCN-NEXT: s_cbranch_execz BB0_{{[0-9]+}}
|
||||||
|
|
||||||
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
|
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
|
||||||
; GCN: ds_write_b32
|
; GCN: ds_write_b32
|
||||||
; GCN: ; divergent unreachable
|
; GCN: ; divergent unreachable
|
||||||
|
|
||||||
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %UnifiedReturnBlock
|
; GCN-NEXT: BB0_{{[0-9]+}}: ; %UnifiedReturnBlock
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
|
|
||||||
define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
|
define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
|
||||||
|
@ -28,12 +29,13 @@ ret:
|
||||||
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
|
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
|
||||||
; GCN: v_cmp_ne_u32
|
; GCN: v_cmp_ne_u32
|
||||||
; GCN: s_and_saveexec_b64
|
; GCN: s_and_saveexec_b64
|
||||||
|
; GCN-NEXT: s_cbranch_execz BB1_{{[0-9]+}}
|
||||||
|
|
||||||
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
|
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
|
||||||
; GCN: ds_write_b32
|
; GCN: ds_write_b32
|
||||||
; GCN: ; divergent unreachable
|
; GCN: ; divergent unreachable
|
||||||
|
|
||||||
; GCN: ; %bb.{{[0-9]+}}:
|
; GCN: BB1_{{[0-9]+}}:
|
||||||
; GCN-NEXT: s_endpgm
|
; GCN-NEXT: s_endpgm
|
||||||
define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
|
define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
|
||||||
bb:
|
bb:
|
||||||
|
|
Loading…
Reference in New Issue