[AMDGPU] Remove fix up operand from SI_ELSE
Remove the immediate operand from SI_ELSE that indicates whether EXEC has been modified. Instead, always emit code that handles EXEC, and remove the unnecessary instructions during pre-RA optimisation. This facilitates passes (e.g. SIWholeQuadMode) adding exec mask manipulation after control flow lowering, and passes that run before control flow lowering no longer need to be aware of SI_ELSE handling.

Reviewed By: nhaehnle

Differential Revision: https://reviews.llvm.org/D89644
commit be2afbd019 (parent d99b2a976a)
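For orientation, a sketch (editorial, not part of the patch; wave64 opcodes, illustrative register names) of what an else block looks like after SILowerControlFlow::emitElse with this change, and what the pre-RA pass is then allowed to remove:

    ; emitted unconditionally by emitElse:
    %save:sreg_64 = S_OR_SAVEEXEC_B64 %src, implicit-def $exec, implicit-def $scc, implicit $exec
    ...                                     ; else-block code, may or may not write exec
    %dst:sreg_64 = S_AND_B64 $exec, %save, implicit-def $scc
    $exec = S_XOR_B64_term $exec, %dst, implicit-def $scc

    ; if nothing between the S_OR_SAVEEXEC and the S_AND writes exec,
    ; SIOptimizeExecMaskingPreRA::optimizeElseBranch folds the S_AND away (see below).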
@@ -4625,10 +4625,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
           .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
-          .addDef(Def)
-          .addUse(Use)
-          .addMBB(UncondBrTarget)
-          .addImm(0);
+          .addDef(Def)
+          .addUse(Use)
+          .addMBB(UncondBrTarget);
       }
 
       if (Br) {
@@ -325,7 +325,7 @@ def SI_IF: CFPseudoInstSI <
 
 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_1:$dst),
-  (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
   let Size = 12;
   let hasSideEffects = 1;
 }
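Tying the slimmed-down pseudo definition back to the builder call in the legalizer hunk above, the operand roles follow the $dst/$src/$target names (the comments are editorial glosses, not from the patch):

    B.buildInstr(AMDGPU::SI_ELSE)
        .addDef(Def)             // $dst: mask result, later consumed by SI_END_CF
        .addUse(Use)             // $src: mask produced by the matching SI_IF
        .addMBB(UncondBrTarget); // $target: branch destination of the pseudo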
@@ -745,7 +745,7 @@ def : GCNPat<
 
 def : GCNPat<
   (AMDGPUelse i1:$src, bb:$target),
-  (SI_ELSE $src, $target, 0)
+  (SI_ELSE $src, $target)
 >;
 
 def : Pat <
@@ -333,13 +333,11 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   Register DstReg = MI.getOperand(0).getReg();
 
-  bool ExecModified = MI.getOperand(3).getImm() != 0;
   MachineBasicBlock::iterator Start = MBB.begin();
 
   // This must be inserted before phis and any spill code inserted before the
   // else.
-  Register SaveReg = ExecModified ?
-    MRI->createVirtualRegister(BoolRC) : DstReg;
+  Register SaveReg = MRI->createVirtualRegister(BoolRC);
   MachineInstr *OrSaveExec =
     BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
     .add(MI.getOperand(1)); // Saved EXEC
@@ -348,15 +346,14 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   MachineBasicBlock::iterator ElsePt(MI);
 
-  if (ExecModified) {
-    MachineInstr *And =
-      BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
-      .addReg(Exec)
-      .addReg(SaveReg);
+  // This accounts for any modification of the EXEC mask within the block and
+  // can be optimized out pre-RA when not required.
+  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
+                          .addReg(Exec)
+                          .addReg(SaveReg);
 
-    if (LIS)
-      LIS->InsertMachineInstrInMaps(*And);
-  }
+  if (LIS)
+    LIS->InsertMachineInstrInMaps(*And);
 
   MachineInstr *Xor =
     BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
@@ -386,8 +383,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
     LIS->removeInterval(DstReg);
     LIS->createAndComputeVirtRegInterval(DstReg);
-    if (ExecModified)
-      LIS->createAndComputeVirtRegInterval(SaveReg);
+    LIS->createAndComputeVirtRegInterval(SaveReg);
 
     // Let this be recomputed.
     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
@@ -35,10 +35,13 @@ private:
   unsigned AndOpc;
   unsigned Andn2Opc;
+  unsigned OrSaveExecOpc;
+  unsigned XorTermrOpc;
   Register CondReg;
   Register ExecReg;
 
   Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
+  bool optimizeElseBranch(MachineBasicBlock &MBB);
 
 public:
   static char ID;
@@ -224,6 +227,81 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
   return CCReg;
 }
 
+// Optimize sequence
+//    %dst = S_OR_SAVEEXEC %src
+//    ... instructions not modifying exec ...
+//    %tmp = S_AND $exec, %dst
+//    $exec = S_XOR_term $exec, %tmp
+// =>
+//    %dst = S_OR_SAVEEXEC %src
+//    ... instructions not modifying exec ...
+//    $exec = S_XOR_term $exec, %dst
+//
+// Clean up potentially unnecessary code added for safety during
+// control flow lowering.
+//
+// Return whether any changes were made to MBB.
+bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
+  if (MBB.empty())
+    return false;
+
+  // Check this is an else block.
+  auto First = MBB.begin();
+  MachineInstr &SaveExecMI = *First;
+  if (SaveExecMI.getOpcode() != OrSaveExecOpc)
+    return false;
+
+  auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) {
+    return MI.getOpcode() == XorTermrOpc;
+  });
+  if (I == MBB.terminators().end())
+    return false;
+
+  MachineInstr &XorTermMI = *I;
+  if (XorTermMI.getOperand(1).getReg() != ExecReg)
+    return false;
+
+  Register SavedExecReg = SaveExecMI.getOperand(0).getReg();
+  Register DstReg = XorTermMI.getOperand(2).getReg();
+
+  // Find potentially unnecessary S_AND
+  MachineInstr *AndExecMI = nullptr;
+  I--;
+  while (I != First && !AndExecMI) {
+    if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg &&
+        I->getOperand(1).getReg() == ExecReg)
+      AndExecMI = &*I;
+    I--;
+  }
+  if (!AndExecMI)
+    return false;
+
+  // Check for exec modifying instructions.
+  // Note: exec defs do not create live ranges beyond the
+  // instruction so isDefBetween cannot be used.
+  // Instead just check that the def segments are adjacent.
+  SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
+  SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
+  for (MCRegUnitIterator UI(ExecReg, TRI); UI.isValid(); ++UI) {
+    LiveRange &RegUnit = LIS->getRegUnit(*UI);
+    if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx)))
+      return false;
+  }
+
+  // Remove unnecessary S_AND
+  LIS->removeInterval(SavedExecReg);
+  LIS->removeInterval(DstReg);
+
+  SaveExecMI.getOperand(0).setReg(DstReg);
+
+  LIS->RemoveMachineInstrFromMaps(*AndExecMI);
+  AndExecMI->eraseFromParent();
+
+  LIS->createAndComputeVirtRegInterval(DstReg);
+
+  return true;
+}
+
 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
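Concretely, the rewrite optimizeElseBranch performs, mirrored by the MIR tests further down (a sketch; virtual register numbers are illustrative):

    ; before
    %1:sreg_64 = S_OR_SAVEEXEC_B64 %0, implicit-def $exec, implicit-def $scc, implicit $exec
    ; ... no writes to exec ...
    %2:sreg_64 = S_AND_B64 $exec, %1, implicit-def $scc
    $exec = S_XOR_B64_term $exec, %2, implicit-def $scc

    ; after: the S_AND is erased and the save-exec defines %2 directly
    %2:sreg_64 = S_OR_SAVEEXEC_B64 %0, implicit-def $exec, implicit-def $scc, implicit $exec
    $exec = S_XOR_B64_term $exec, %2, implicit-def $scc

Note the design of the safety check: a def of exec does not extend a live range past the defining instruction, so the pass cannot simply ask whether a def lies between two points. Instead it checks that, for each exec register unit, the live-range segment at the S_OR_SAVEEXEC and the one at the S_AND are adjacent, which holds exactly when no other exec def falls between them.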
@@ -237,6 +315,9 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   const bool Wave32 = ST.isWave32();
   AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
   Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+  OrSaveExecOpc =
+      Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+  XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
   CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
   ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -245,6 +326,11 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
 
   for (MachineBasicBlock &MBB : MF) {
 
+    if (optimizeElseBranch(MBB)) {
+      RecalcRegs.insert(AMDGPU::SCC);
+      Changed = true;
+    }
+
     if (Register Reg = optimizeVcndVcmpPair(MBB)) {
       RecalcRegs.insert(Reg);
       RecalcRegs.insert(AMDGPU::VCC_LO);
@@ -730,9 +730,6 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
       if (MI.isTerminator() && OutNeeds == StateExact)
         Needs = StateExact;
 
-      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
-        MI.getOperand(3).setImm(1);
-
       ++Next;
     } else {
       // End of basic block
@@ -140,7 +140,7 @@ body: |
     ; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-    ; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; WAVE64: G_BR %bb.1
     ; WAVE64: bb.1:
     ; WAVE32-LABEL: name: brcond_si_else
@@ -149,7 +149,7 @@ body: |
     ; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-    ; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; WAVE32: G_BR %bb.1
     ; WAVE32: bb.1:
     bb.0:
@@ -475,13 +475,14 @@ body: |
   ; GCN: bb.2:
   ; GCN: successors: %bb.3(0x40000000), %bb.6(0x40000000)
   ; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GCN: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; GCN: $exec = S_XOR_B64_term $exec, [[S_AND_B64_1]], implicit-def $scc
   ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
   ; GCN: bb.3:
   ; GCN: successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
-  ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
-  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
+  ; GCN: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
+  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_2]]
   ; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec
   ; GCN: bb.4:
   ; GCN: successors: %bb.5(0x80000000)
@@ -489,7 +490,7 @@ body: |
   ; GCN: successors: %bb.6(0x80000000)
   ; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
   ; GCN: bb.6:
-  ; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; GCN: $exec = S_OR_B64 $exec, [[S_AND_B64_1]], implicit-def $scc
   ; GCN: S_ENDPGM 0
   bb.0:
     successors: %bb.1, %bb.2
@@ -502,7 +503,7 @@ body: |
 
   bb.2:
     successors: %bb.3, %bb.6
-    %2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.3:
     successors: %bb.3, %bb.4
@@ -196,19 +196,20 @@ end:
 
 ; Regular spill value restored after exec modification
 ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
-; Followed by spill
-; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+
+; GCN: s_and_b64 s{{\[}}[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]{{\]}}, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
 
 ; Spill saved exec
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
 
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], 0
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], 1
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_AND_EXEC_LO]], 0
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1
 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
-; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
+; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}}
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
@@ -100,11 +100,12 @@ body: |
   ; CHECK: bb.0:
   ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5
-  ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5
   ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec
-  ; CHECK: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+  ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_B64_]], implicit-def $scc
   ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec
   ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; CHECK: S_BRANCH %bb.2
@@ -120,7 +121,7 @@ body: |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:sreg_64_xexec = COPY $sgpr4_sgpr5
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec
-    %3:sreg_64_xexec = SI_ELSE %2, %bb.1, 0, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_ELSE %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
    %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec
    S_BRANCH %bb.2