//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the ELSE block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/MC/MCAsmInfo.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;
  unsigned SkipThreshold;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
  MachineInstr *Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;

FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}
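
/// \returns true if \p Opc lowers to no real machine instructions, so that
/// shouldSkip() does not count it toward the skip threshold.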
static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}
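
/// \returns true if the code between \p From and \p To should be jumped over
/// when EXEC is zero: either it spans at least SkipThreshold real
/// instructions, or it contains a uniform loop branch (S_CBRANCH_VCCNZ/VCCZ)
/// that is never taken with EXEC = 0 and could therefore loop forever.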
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {
  if (From->succ_empty())
    return false;

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // The inline asm length estimate is a number of bytes, assuming the
        // longest instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}
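
/// Insert an S_CBRANCH_EXECZ before \p From that branches to \p To, or return
/// nullptr if shouldSkip() decides the region is too short to be worth the
/// branch.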
MachineInstr *SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return nullptr;

  const DebugLoc &DL = From.getDebugLoc();
  MachineInstr *Skip =
    BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
  return Skip;
}
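
/// For a kill in a pixel shader, insert a block that exports to the NULL
/// target and terminates the wavefront for the case where the whole wave is
/// dead; live waves branch over it with S_CBRANCH_EXECNZ. \returns true if
/// the skip block was inserted.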
bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
  MBB.addSuccessor(SkipBB);

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}
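
/// Lower SI_IF: save the old exec mask with S_AND_SAVEEXEC_B64, then XOR the
/// saved mask with the new EXEC so the destination SGPR pair holds exactly
/// the lanes that must run the ELSE block later.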
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));

  // Insert before the new branch instruction.
  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}
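
/// Lower SI_ELSE: S_OR_SAVEEXEC_B64 saves the current exec mask and ORs in
/// the lanes recorded at the SI_IF; the trailing S_XOR_B64 then turns off the
/// lanes that already ran the IF block, leaving only the ELSE lanes enabled.
/// If the exec-fix flag (operand 3, set by SIWholeQuadMode) is non-zero, the
/// saved mask is first ANDed with EXEC to account for exec changes made
/// inside the block.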
void SILowerControlFlow::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  if (MI.getOperand(3).getImm() != 0) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
      .addReg(AMDGPU::EXEC)
      .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));

  // Insert before the new branch instruction.
  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}
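
/// Lower SI_BREAK: add the lanes that are still active (EXEC) to the
/// accumulated break mask so they stay off until the loop exits.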
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}
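
/// Lower SI_IF_BREAK: add the lanes whose break condition holds (VCC) to the
/// accumulated break mask.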
void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}
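
/// Lower SI_ELSE_BREAK: merge the break mask saved before the ELSE block with
/// the one computed inside it.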
void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}
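
/// Lower SI_LOOP: turn off all lanes that have hit a break, then branch back
/// to the loop header with S_CBRANCH_EXECNZ while any lane remains active.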
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}
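
/// Lower SI_END_CF: OR the mask saved at the matching SI_IF/SI_ELSE back into
/// EXEC, re-enabling the lanes that were disabled for the divergent region.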
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}
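
/// Erase an S_BRANCH whose target is simply the fallthrough block.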
void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}
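
/// Lower SI_KILL_TERMINATOR: clear EXEC entirely for a negative constant
/// operand, or use V_CMPX_LE_F32 (0 <= Op) to turn off each lane whose
/// operand is negative.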
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative.
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}
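
/// Create an empty basic block immediately after \p MBB to hold the
/// export-and-terminate sequence emitted by skipIfDead().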
MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);

  return SkipBB;
}
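
/// Expand all control flow pseudo instructions in the function. The nesting
/// depth is tracked so that a kill inside divergent control flow is deferred
/// rather than handled immediately (see the TODO at SI_END_CF).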
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default: break;
      case AMDGPU::SI_IF:
        ++Depth;
        If(MI);
        break;

      case AMDGPU::SI_ELSE:
        Else(MI);
        break;

      case AMDGPU::SI_BREAK:
        Break(MI);
        break;

      case AMDGPU::SI_IF_BREAK:
        IfBreak(MI);
        break;

      case AMDGPU::SI_ELSE_BREAK:
        ElseBreak(MI);
        break;

      case AMDGPU::SI_LOOP:
        ++Depth;
        Loop(MI);
        break;

      case AMDGPU::SI_END_CF:
        if (--Depth == 0 && HaveKill) {
          HaveKill = false;
          // TODO: Insert skip if exec is 0?
        }

        EndCf(MI);
        break;

      case AMDGPU::SI_KILL_TERMINATOR:
        if (Depth == 0) {
          if (skipIfDead(MI, *NextBB)) {
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else
          HaveKill = true;
        Kill(MI);
        break;

      case AMDGPU::S_BRANCH:
        Branch(MI);
        break;

      case AMDGPU::SI_RETURN: {
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;
      }
      }
    }
  }

  return true;
}
|