forked from OSchip/llvm-project
[AMDGPU] Increase detection range for s_mov, v_cmpx transformation.
We found that it might be beneficial to have the SIOptimizeExecMasking pass detect more cases where v_cmp, s_and_saveexec patterns can be transformed to s_mov, v_cmpx patterns. Currently, the search range for finding a fitting v_cmp instruction is 5; this change increases it to 20. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D123367
This commit is contained in:
parent
08920cc043
commit
6d97ca690c
|
@ -302,12 +302,15 @@ static MachineInstr *
|
||||||
findInstrBackwards(MachineInstr &Origin,
|
findInstrBackwards(MachineInstr &Origin,
|
||||||
std::function<bool(MachineInstr *)> Pred,
|
std::function<bool(MachineInstr *)> Pred,
|
||||||
ArrayRef<MCRegister> NonModifiableRegs,
|
ArrayRef<MCRegister> NonModifiableRegs,
|
||||||
const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
|
const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
|
||||||
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
|
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
|
||||||
E = Origin.getParent()->rend();
|
E = Origin.getParent()->rend();
|
||||||
unsigned CurrentIteration = 0;
|
unsigned CurrentIteration = 0;
|
||||||
|
|
||||||
for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
|
for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
|
||||||
|
if (A->isDebugInstr())
|
||||||
|
continue;
|
||||||
|
|
||||||
if (Pred(&*A))
|
if (Pred(&*A))
|
||||||
return &*A;
|
return &*A;
|
||||||
|
|
||||||
|
@ -315,7 +318,7 @@ findInstrBackwards(MachineInstr &Origin,
|
||||||
if (A->modifiesRegister(Reg, TRI))
|
if (A->modifiesRegister(Reg, TRI))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
++CurrentIteration;
|
++CurrentIteration;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
|
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
|
||||||
# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
|
# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# After the Optimize exec masking (post-RA) pass, there's a chance of having v_cmpx instructions
|
# After the Optimize exec masking (post-RA) pass, there's a chance of having v_cmpx instructions
|
||||||
|
@ -62,3 +61,29 @@ body: |
|
||||||
$sgpr2_sgpr3 = COPY $exec, implicit-def $exec
|
$sgpr2_sgpr3 = COPY $exec, implicit-def $exec
|
||||||
$sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
|
$sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
|
||||||
$exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
|
$exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
|
||||||
|
...
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Check if the sequence will be optimized even with more than 5 (unrelated) instructions in between the v_cmp and s_and_saveexec.
|
||||||
|
|
||||||
|
# GCN-LABEL: name: vcmp_saveexec_to_mov_vcmpx_check_many_instrs
|
||||||
|
# GFX1010: V_CMP_LT_F32_e64
|
||||||
|
# GFX1010: S_AND_SAVEEXEC_B64
|
||||||
|
# GFX1030: S_MOV_B64
|
||||||
|
# GFX1030: V_CMPX_LT_F32_nosdst_e64 0, 953267991, 2
|
||||||
|
name: vcmp_saveexec_to_mov_vcmpx_check_many_instrs
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $sgpr2, $vgpr1
|
||||||
|
renamable $sgpr0_sgpr1 = V_CMP_LT_F32_e64 0, 953267991, 2, $vgpr0, 0, implicit $mode, implicit $exec
|
||||||
|
$vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
|
||||||
|
$vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
|
||||||
|
$vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
|
||||||
|
$vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
|
||||||
|
$vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
|
||||||
|
$vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
|
||||||
|
$sgpr2_sgpr3 = COPY $exec, implicit-def $exec
|
||||||
|
$sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
|
||||||
|
$exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
|
||||||
|
|
Loading…
Reference in New Issue