[AMDGPU] Remove fix up operand from SI_ELSE

Remove immediate operand from SI_ELSE which indicates if EXEC has
been modified.  Instead always emit code that handles EXEC and
remove unnecessary instructions during pre-RA optimisation.

This makes it easier for passes (e.g. SIWholeQuadMode) to add exec mask
manipulation after control flow lowering, and passes that run before
control flow lowering no longer need to be aware of SI_ELSE handling.

Reviewed By: nhaehnle

Differential Revision: https://reviews.llvm.org/D89644
This commit is contained in:
Carl Ritson 2020-10-20 18:11:51 +09:00
parent d99b2a976a
commit be2afbd019
9 changed files with 120 additions and 39 deletions

View File

@ -4625,10 +4625,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
.addMBB(UncondBrTarget);
} else {
B.buildInstr(AMDGPU::SI_ELSE)
.addDef(Def)
.addUse(Use)
.addMBB(UncondBrTarget)
.addImm(0);
.addDef(Def)
.addUse(Use)
.addMBB(UncondBrTarget);
}
if (Br) {

View File

@ -325,7 +325,7 @@ def SI_IF: CFPseudoInstSI <
def SI_ELSE : CFPseudoInstSI <
(outs SReg_1:$dst),
(ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
(ins SReg_1:$src, brtarget:$target), [], 1, 1> {
let Size = 12;
let hasSideEffects = 1;
}
@ -745,7 +745,7 @@ def : GCNPat<
def : GCNPat<
(AMDGPUelse i1:$src, bb:$target),
(SI_ELSE $src, $target, 0)
(SI_ELSE $src, $target)
>;
def : Pat <

View File

@ -333,13 +333,11 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
// This must be inserted before phis and any spill code inserted before the
// else.
Register SaveReg = ExecModified ?
MRI->createVirtualRegister(BoolRC) : DstReg;
Register SaveReg = MRI->createVirtualRegister(BoolRC);
MachineInstr *OrSaveExec =
BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
.add(MI.getOperand(1)); // Saved EXEC
@ -348,15 +346,14 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock::iterator ElsePt(MI);
if (ExecModified) {
MachineInstr *And =
BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
.addReg(Exec)
.addReg(SaveReg);
// This accounts for any modification of the EXEC mask within the block and
// can be optimized out pre-RA when not required.
MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
.addReg(Exec)
.addReg(SaveReg);
if (LIS)
LIS->InsertMachineInstrInMaps(*And);
}
if (LIS)
LIS->InsertMachineInstrInMaps(*And);
MachineInstr *Xor =
BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
@ -386,8 +383,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
LIS->removeInterval(DstReg);
LIS->createAndComputeVirtRegInterval(DstReg);
if (ExecModified)
LIS->createAndComputeVirtRegInterval(SaveReg);
LIS->createAndComputeVirtRegInterval(SaveReg);
// Let this be recomputed.
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

View File

@ -35,10 +35,13 @@ private:
unsigned AndOpc;
unsigned Andn2Opc;
unsigned OrSaveExecOpc;
unsigned XorTermrOpc;
Register CondReg;
Register ExecReg;
Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
bool optimizeElseBranch(MachineBasicBlock &MBB);
public:
static char ID;
@ -224,6 +227,81 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
return CCReg;
}
// Simplify the exec handling found at the top/bottom of an else block.
//
// Rewrites
//   %saved = S_OR_SAVEEXEC %src
//   ... instructions not modifying exec ...
//   %dst = S_AND $exec, %saved
//   $exec = S_XOR_term $exec, %dst
// into
//   %dst = S_OR_SAVEEXEC %src
//   ... instructions not modifying exec ...
//   $exec = S_XOR_term $exec, %dst
//
// The S_AND is emitted for safety during control flow lowering and is
// potentially unnecessary; this removes it when exec is left untouched.
//
// Returns true if MBB was modified, false otherwise.
bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
  if (MBB.empty())
    return false;

  // Only blocks that open with S_OR_SAVEEXEC are treated as else blocks.
  auto BlockBegin = MBB.begin();
  MachineInstr &OrSaveExec = *BlockBegin;
  if (OrSaveExec.getOpcode() != OrSaveExecOpc)
    return false;

  // Look for the S_XOR terminator among the block's terminators.
  auto Terminators = MBB.terminators();
  auto XorPos = llvm::find_if(Terminators, [this](const MachineInstr &Term) {
    return Term.getOpcode() == XorTermrOpc;
  });
  if (XorPos == Terminators.end())
    return false;

  // The XOR must take exec as its first source.
  MachineInstr &XorTerm = *XorPos;
  if (XorTerm.getOperand(1).getReg() != ExecReg)
    return false;

  Register SavedExecReg = OrSaveExec.getOperand(0).getReg();
  Register DstReg = XorTerm.getOperand(2).getReg();

  // Walk backwards from the XOR (stopping before the S_OR_SAVEEXEC) to find
  // the S_AND of exec that defines the register consumed by the XOR.
  // NOTE(review): the second source of the S_AND is not compared against
  // SavedExecReg here; presumably guaranteed by how the sequence is emitted
  // during control flow lowering - confirm.
  MachineInstr *AndMI = nullptr;
  for (auto Pos = std::prev(XorPos); Pos != BlockBegin; --Pos) {
    const bool IsExecAnd = Pos->getOpcode() == AndOpc &&
                           Pos->getOperand(0).getReg() == DstReg &&
                           Pos->getOperand(1).getReg() == ExecReg;
    if (IsExecAnd) {
      AndMI = &*Pos;
      break;
    }
  }
  if (!AndMI)
    return false;

  // Bail out if anything between the save and the S_AND writes exec.
  // Defs of exec do not produce live ranges extending past the defining
  // instruction, so isDefBetween is not usable; instead require that, for
  // every register unit of exec, the segment found at the S_AND directly
  // follows the segment found at the S_OR_SAVEEXEC.
  const SlotIndex SaveIdx = LIS->getInstructionIndex(OrSaveExec);
  const SlotIndex AndIdx = LIS->getInstructionIndex(*AndMI);
  for (MCRegUnitIterator Unit(ExecReg, TRI); Unit.isValid(); ++Unit) {
    LiveRange &UnitRange = LIS->getRegUnit(*Unit);
    if (UnitRange.find(SaveIdx) != std::prev(UnitRange.find(AndIdx)))
      return false;
  }

  // Drop the S_AND: discard the stale intervals, make S_OR_SAVEEXEC define
  // DstReg directly, erase the S_AND, then rebuild DstReg's interval.
  LIS->removeInterval(SavedExecReg);
  LIS->removeInterval(DstReg);

  OrSaveExec.getOperand(0).setReg(DstReg);

  LIS->RemoveMachineInstrFromMaps(*AndMI);
  AndMI->eraseFromParent();

  LIS->createAndComputeVirtRegInterval(DstReg);

  return true;
}
bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@ -237,6 +315,9 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
const bool Wave32 = ST.isWave32();
AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
OrSaveExecOpc =
Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@ -245,6 +326,11 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
if (optimizeElseBranch(MBB)) {
RecalcRegs.insert(AMDGPU::SCC);
Changed = true;
}
if (Register Reg = optimizeVcndVcmpPair(MBB)) {
RecalcRegs.insert(Reg);
RecalcRegs.insert(AMDGPU::VCC_LO);

View File

@ -730,9 +730,6 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;
if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
MI.getOperand(3).setImm(1);
++Next;
} else {
// End of basic block

View File

@ -140,7 +140,7 @@ body: |
; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: G_BR %bb.1
; WAVE64: bb.1:
; WAVE32-LABEL: name: brcond_si_else
@ -149,7 +149,7 @@ body: |
; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: G_BR %bb.1
; WAVE32: bb.1:
bb.0:

View File

@ -475,13 +475,14 @@ body: |
; GCN: bb.2:
; GCN: successors: %bb.3(0x40000000), %bb.6(0x40000000)
; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
; GCN: $exec = S_XOR_B64_term $exec, [[S_AND_B64_1]], implicit-def $scc
; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
; GCN: bb.3:
; GCN: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
; GCN: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_2]]
; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec
; GCN: bb.4:
; GCN: successors: %bb.5(0x80000000)
@ -489,7 +490,7 @@ body: |
; GCN: successors: %bb.6(0x80000000)
; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
; GCN: bb.6:
; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
; GCN: $exec = S_OR_B64 $exec, [[S_AND_B64_1]], implicit-def $scc
; GCN: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
@ -502,7 +503,7 @@ body: |
bb.2:
successors: %bb.3, %bb.6
%2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
%2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
bb.3:
successors: %bb.3, %bb.4

View File

@ -196,19 +196,20 @@ end:
; Regular spill value restored after exec modification
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; Followed by spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_and_b64 s{{\[}}[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]{{\]}}, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], 0
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], 1
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_AND_EXEC_LO]], 0
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}}
; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]

View File

@ -100,11 +100,12 @@ body: |
; CHECK: bb.0:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5
; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_B64_]], implicit-def $scc
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec
; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK: S_BRANCH %bb.2
@ -120,7 +121,7 @@ body: |
%0:vgpr_32 = COPY killed $vgpr0
%1:sreg_64_xexec = COPY $sgpr4_sgpr5
%2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec
%3:sreg_64_xexec = SI_ELSE %2, %bb.1, 0, implicit-def $exec, implicit-def dead $scc, implicit $exec
%3:sreg_64_xexec = SI_ELSE %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
%4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec
S_BRANCH %bb.2