[AMDGPU] Introduce Strict WQM mode
* Add the amdgcn_strict_wqm intrinsic.
* Add a corresponding STRICT_WQM machine instruction.
* The semantics are similar to amdgcn_strict_wwm, with the notable difference that not all threads are forcibly enabled during the computation of the intrinsic's argument; only the threads in quads that have at least one active thread are enabled.
* The difference between amdgcn_wqm and amdgcn_strict_wqm is that in strict mode an inactive lane is always enabled, irrespective of control flow decisions.

Reviewed By: critson

Differential Revision: https://reviews.llvm.org/D96258
parent 5d613e42d3
commit 4672bac177
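For orientation, a minimal usage sketch of the new intrinsic follows (not part of the patch; the function and value names are illustrative). The computation feeding llvm.amdgcn.strict.wqm is executed with every lane enabled in any quad that has at least one active lane, and the result is copied back under the original exec mask:

; Minimal sketch, assuming only the llvm.amdgcn.strict.wqm intrinsic
; introduced by this patch; @strict_wqm_sketch is a hypothetical shader.
define amdgpu_ps float @strict_wqm_sketch(float %a, float %b) {
main_body:
  ; %sum is evaluated in strict WQM: all lanes of any quad with at least
  ; one active lane participate, irrespective of control flow.
  %sum = fadd float %a, %b
  %q = call float @llvm.amdgcn.strict.wqm.f32(float %sum)
  ret float %q
}

declare float @llvm.amdgcn.strict.wqm.f32(float)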
@@ -1621,6 +1621,10 @@ def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
                 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                                      IntrConvergent, IntrWillReturn]
 >;
+def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
+                [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
+                                     IntrConvergent, IntrWillReturn]
+>;
 
 // Given a value, copies it while setting all the inactive lanes to a given
 // value. Note that OpenGL helper lanes are considered active, so if the
@@ -2645,6 +2645,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_strict_wwm:
     Opcode = AMDGPU::STRICT_WWM;
     break;
+  case Intrinsic::amdgcn_strict_wqm:
+    Opcode = AMDGPU::STRICT_WQM;
+    break;
   case Intrinsic::amdgcn_interp_p1_f16:
     SelectInterpP1F16(N);
     return;
@@ -930,6 +930,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   case Intrinsic::amdgcn_strict_wwm:
   case Intrinsic::amdgcn_wwm:
     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
+  case Intrinsic::amdgcn_strict_wqm:
+    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
   case Intrinsic::amdgcn_writelane:
     return selectWritelane(I);
   case Intrinsic::amdgcn_div_scale:
@@ -3958,6 +3958,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_mov_dpp:
     case Intrinsic::amdgcn_strict_wwm:
     case Intrinsic::amdgcn_wwm:
+    case Intrinsic::amdgcn_strict_wqm:
     case Intrinsic::amdgcn_wqm:
     case Intrinsic::amdgcn_softwqm:
     case Intrinsic::amdgcn_set_inactive:
@@ -581,6 +581,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
       continue;
     case AMDGPU::COPY:
     case AMDGPU::WQM:
+    case AMDGPU::STRICT_WQM:
     case AMDGPU::SOFT_WQM:
     case AMDGPU::STRICT_WWM: {
       Register DstReg = MI.getOperand(0).getReg();
@@ -1949,9 +1949,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
                        : AMDGPU::S_OR_SAVEEXEC_B64));
     break;
   }
-  case AMDGPU::EXIT_STRICT_WWM: {
+  case AMDGPU::ENTER_STRICT_WQM: {
     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
-    // Whole Wave Mode is exited.
+    // STRICT_WQM is entered.
+    const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
+    const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
+    BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
+
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::EXIT_STRICT_WWM:
+  case AMDGPU::EXIT_STRICT_WQM: {
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+    // WWM/STRICT_WQM is exited.
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
@@ -4407,6 +4420,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::WQM: return AMDGPU::WQM;
   case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
   case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
+  case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
   case AMDGPU::S_MOV_B32: {
     const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
     return MI.getOperand(1).isReg() ||
@@ -6643,6 +6657,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
     case AMDGPU::WQM:
     case AMDGPU::SOFT_WQM:
     case AMDGPU::STRICT_WWM:
+    case AMDGPU::STRICT_WQM:
     case AMDGPU::REG_SEQUENCE:
     case AMDGPU::PHI:
     case AMDGPU::INSERT_SUBREG:
@@ -6800,7 +6815,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
   case AMDGPU::INSERT_SUBREG:
   case AMDGPU::WQM:
   case AMDGPU::SOFT_WQM:
-  case AMDGPU::STRICT_WWM: {
+  case AMDGPU::STRICT_WWM:
+  case AMDGPU::STRICT_WQM: {
     const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
     if (RI.hasAGPRs(SrcRC)) {
       if (RI.hasAGPRs(NewDstRC))
@@ -125,6 +125,7 @@ def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 // accidentally clobber inactive channels of $vdst.
 let Constraints = "@earlyclobber $vdst" in {
 def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 }
 
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
@@ -143,6 +144,20 @@ def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
   let mayStore = 0;
 }
 
+def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+  let Uses = [EXEC];
+  let Defs = [EXEC, SCC];
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
+def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC] in {
@@ -38,6 +38,9 @@ private:
   RegisterClassInfo RegClassInfo;
 
   std::vector<unsigned> RegsToRewrite;
+#ifndef NDEBUG
+  void printWWMInfo(const MachineInstr &MI);
+#endif
 
 public:
   static char ID;
@@ -154,6 +157,31 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
   MRI->freezeReservedRegs(MF);
 }
 
+#ifndef NDEBUG
+LLVM_DUMP_METHOD void
+SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
+
+  unsigned Opc = MI.getOpcode();
+
+  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
+    dbgs() << "Entering ";
+  } else {
+    assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
+    dbgs() << "Exiting ";
+  }
+
+  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
+    dbgs() << "Strict WWM ";
+  } else {
+    assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
+    dbgs() << "Strict WQM ";
+  }
+
+  dbgs() << "region: " << MI;
+}
+
+#endif
+
 bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
 
@@ -185,21 +213,23 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
         MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
       RegsAssigned |= processDef(MI.getOperand(0));
 
-    if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM) {
-      LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+    if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
+        MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
+      LLVM_DEBUG(printWWMInfo(MI));
       InWWM = true;
       continue;
     }
 
-    if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM) {
-      LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+    if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
+        MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
+      LLVM_DEBUG(printWWMInfo(MI));
       InWWM = false;
     }
 
     if (!InWWM)
       continue;
 
-    LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+    LLVM_DEBUG(dbgs() << "Processing " << MI);
 
     for (MachineOperand &DefOpnd : MI.defs()) {
       RegsAssigned |= processDef(DefOpnd);
@@ -7,8 +7,13 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// This pass adds instructions to enable whole quad mode for pixel
-/// shaders, and whole wavefront mode for all programs.
+/// This pass adds instructions to enable whole quad mode (strict or non-strict)
+/// for pixel shaders, and strict whole wavefront mode for all programs.
+///
+/// The "strict" prefix indicates that inactive lanes do not take part in
+/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
+/// always be enabled irrespective of control flow decisions. Conversely, in
+/// non-strict WQM inactive lanes may take part in control flow decisions.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
 /// with shader side effects (stores and atomics). It ensures that WQM is
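To make the strict/non-strict distinction above concrete, here is a hedged sketch (assuming the llvm.amdgcn.strict.wqm intrinsic from this patch; @strict_under_divergence is a hypothetical shader). Under a divergent branch, plain WQM helper lanes follow the branch, while the strict form still evaluates its argument with whole quads enabled:

define amdgpu_ps float @strict_under_divergence(float %x, i32 %tid) {
entry:
  %cc = icmp eq i32 %tid, 0
  br i1 %cc, label %if, label %end

if:
  ; Only some lanes reach this block, but the fadd feeding the strict WQM
  ; copy runs with all lanes of the affected quads enabled.
  %sum = fadd float %x, %x
  %v = call float @llvm.amdgcn.strict.wqm.f32(float %sum)
  br label %end

end:
  %r = phi float [ %v, %if ], [ 0.0, %entry ]
  ret float %r
}

declare float @llvm.amdgcn.strict.wqm.f32(float)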
@@ -26,12 +31,21 @@
 /// ...
 /// S_MOV_B64 EXEC, Tmp
 ///
-/// We also compute when a sequence of instructions requires Whole Wavefront
-/// Mode (StrictWWM) and insert instructions to save and restore it:
+/// We also compute when a sequence of instructions requires strict whole
+/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
 ///
 /// S_OR_SAVEEXEC_B64 Tmp, -1
 /// ...
 /// S_MOV_B64 EXEC, Tmp
 ///
+/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
+/// we use a similar save and restore mechanism and force whole quad mode for
+/// those instructions:
+///
+/// S_MOV_B64 Tmp, EXEC
+/// S_WQM_B64 EXEC, EXEC
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
 /// In order to avoid excessive switching during sequences of Exact
 /// instructions, the pass first analyzes which instructions must be run in WQM
@@ -77,7 +91,9 @@ namespace {
 enum {
   StateWQM = 0x1,
   StateStrictWWM = 0x2,
-  StateExact = 0x4,
+  StateStrictWQM = 0x4,
+  StateExact = 0x8,
+  StateStrict = StateStrictWWM | StateStrictWQM,
 };
 
 struct PrintState {
@@ -89,19 +105,23 @@ public:
 
 #ifndef NDEBUG
 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
-  if (PS.State & StateWQM)
-    OS << "WQM";
-  if (PS.State & StateStrictWWM) {
-    if (PS.State & StateWQM)
-      OS << '|';
-    OS << "StrictWWM";
-  }
-  if (PS.State & StateExact) {
-    if (PS.State & (StateWQM | StateStrictWWM))
-      OS << '|';
-    OS << "Exact";
-  }
+
+  static const std::pair<char, const char *> Mapping[] = {
+      std::make_pair(StateWQM, "WQM"),
+      std::make_pair(StateStrictWWM, "StrictWWM"),
+      std::make_pair(StateStrictWQM, "StrictWQM"),
+      std::make_pair(StateExact, "Exact")};
+  char State = PS.State;
+  for (auto M : Mapping) {
+    if (State & M.first) {
+      OS << M.second;
+      State &= ~M.first;
+
+      if (State)
+        OS << '|';
+    }
+  }
+  assert(State == 0);
   return OS;
 }
 #endif
 
@@ -151,7 +171,7 @@ private:
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
 
-  // Tracks state (WQM/StrictWWM/Exact) after a given instruction
+  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
   DenseMap<const MachineInstr *, char> StateTransition;
 
   SmallVector<MachineInstr *, 2> LiveMaskQueries;
@@ -184,10 +204,11 @@ private:
              Register SaveWQM);
   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
              Register SavedWQM);
-  void toStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-                   Register SaveOrig);
-  void fromStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-                     Register SavedOrig, char NonStrictWWMState);
+  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+                    Register SaveOrig, char StrictStateNeeded);
+  void fromStrictMode(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator Before, Register SavedOrig,
+                      char NonStrictState, char CurrentStrictState);
 
   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
 
@@ -473,9 +494,17 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       GlobalFlags |= StateStrictWWM;
       LowerToMovInstrs.push_back(&MI);
       continue;
+    } else if (Opcode == AMDGPU::STRICT_WQM) {
+      // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
+      // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
+      // quads that have at least one active thread.
+      markInstructionUses(MI, StateStrictWQM, Worklist);
+      GlobalFlags |= StateStrictWQM;
+      LowerToMovInstrs.push_back(&MI);
+      continue;
     } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                Opcode == AMDGPU::V_SET_INACTIVE_B64) {
-      III.Disabled = StateStrictWWM;
+      III.Disabled = StateStrict;
       MachineOperand &Inactive = MI.getOperand(2);
       if (Inactive.isReg()) {
         if (Inactive.isUndef()) {
@@ -493,7 +522,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         Worklist.push_back(&MBB);
       }
       GlobalFlags |= StateExact;
-      III.Disabled = StateWQM | StateStrictWWM;
+      III.Disabled = StateWQM | StateStrict;
       continue;
     } else {
       if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
@@ -570,7 +599,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
 
   // Propagate backwards within block
   if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = (II.Needs & ~StateStrictWWM) | II.OutNeeds;
+    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
     if (!PrevMI->isPHI()) {
       InstrInfo &PrevII = Instructions[PrevMI];
       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -586,10 +615,12 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
   if (II.Needs != 0)
     markInstructionUses(MI, II.Needs, Worklist);
 
-  // Ensure we process a block containing StrictWWM, even if it does not require
-  // any WQM transitions.
+  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
+  // not require any WQM transitions.
   if (II.Needs & StateStrictWWM)
     BI.Needs |= StateStrictWWM;
+  if (II.Needs & StateStrictWQM)
+    BI.Needs |= StateStrictWQM;
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -1105,30 +1136,48 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
   StateTransition[MI] = StateWQM;
 }
 
-void SIWholeQuadMode::toStrictWWM(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator Before,
-                                  Register SaveOrig) {
+void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator Before,
+                                   Register SaveOrig, char StrictStateNeeded) {
   MachineInstr *MI;
 
   assert(SaveOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
-               SaveOrig)
-           .addImm(-1);
+  assert(StrictStateNeeded == StateStrictWWM ||
+         StrictStateNeeded == StateStrictWQM);
+
+  if (StrictStateNeeded == StateStrictWWM) {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
+                 SaveOrig)
+             .addImm(-1);
+  } else {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
+                 SaveOrig)
+             .addImm(-1);
+  }
   LIS->InsertMachineInstrInMaps(*MI);
   StateTransition[MI] = StateStrictWWM;
 }
 
-void SIWholeQuadMode::fromStrictWWM(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator Before,
-                                    Register SavedOrig,
-                                    char NonStrictWWMState) {
+void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator Before,
+                                     Register SavedOrig, char NonStrictState,
+                                     char CurrentStrictState) {
   MachineInstr *MI;
 
   assert(SavedOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), Exec)
-           .addReg(SavedOrig);
+  assert(CurrentStrictState == StateStrictWWM ||
+         CurrentStrictState == StateStrictWQM);
+
+  if (CurrentStrictState == StateStrictWWM) {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
+                 Exec)
+             .addReg(SavedOrig);
+  } else {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
+                 Exec)
+             .addReg(SavedOrig);
+  }
   LIS->InsertMachineInstrInMaps(*MI);
-  StateTransition[MI] = NonStrictWWMState;
+  StateTransition[MI] = NonStrictState;
 }
 
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
@@ -1149,10 +1198,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
                     << ":\n");
 
   Register SavedWQMReg;
-  Register SavedNonStrictWWMReg;
+  Register SavedNonStrictReg;
   bool WQMFromExec = IsEntry;
   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
-  char NonStrictWWMState = 0;
+  char NonStrictState = 0;
   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
@@ -1166,25 +1215,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
   // Exact or vice versa.
   MachineBasicBlock::iterator FirstWQM = IE;
 
-  // This stores the first instruction where it's safe to switch from StrictWWM
-  // to Exact/WQM or to switch to StrictWWM. It must always be the same as, or
-  // after, FirstWQM since if it's safe to switch to/from StrictWWM, it must be
-  // safe to switch to/from WQM as well.
-  MachineBasicBlock::iterator FirstStrictWWM = IE;
+  // This stores the first instruction where it's safe to switch from Strict
+  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
+  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
+  // be safe to switch to/from WQM as well.
+  MachineBasicBlock::iterator FirstStrict = IE;
 
   // Record initial state in block information.
   BI.InitialState = State;
 
   for (;;) {
     MachineBasicBlock::iterator Next = II;
-    char Needs = StateExact | StateWQM; // StrictWWM is disabled by default
+    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
     char OutNeeds = 0;
 
     if (FirstWQM == IE)
       FirstWQM = II;
 
-    if (FirstStrictWWM == IE)
-      FirstStrictWWM = II;
+    if (FirstStrict == IE)
+      FirstStrict = II;
 
     // First, figure out the allowed states (Needs) based on the propagated
     // flags.
@@ -1196,6 +1245,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
     if (III != Instructions.end()) {
       if (III->second.Needs & StateStrictWWM)
         Needs = StateStrictWWM;
+      else if (III->second.Needs & StateStrictWQM)
+        Needs = StateStrictWQM;
       else if (III->second.Needs & StateWQM)
         Needs = StateWQM;
       else
@@ -1204,8 +1255,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
       }
     } else {
       // If the instruction doesn't actually need a correct EXEC, then we can
-      // safely leave StrictWWM enabled.
-      Needs = StateExact | StateWQM | StateStrictWWM;
+      // safely leave Strict mode enabled.
+      Needs = StateExact | StateWQM | StateStrict;
     }
 
     if (MI.isTerminator() && OutNeeds == StateExact)
@@ -1225,27 +1276,28 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
     // Now, transition if necessary.
     if (!(Needs & State)) {
       MachineBasicBlock::iterator First;
-      if (State == StateStrictWWM || Needs == StateStrictWWM) {
-        // We must switch to or from StrictWWM
-        First = FirstStrictWWM;
+      if (State == StateStrictWWM || Needs == StateStrictWWM ||
+          State == StateStrictWQM || Needs == StateStrictWQM) {
+        // We must switch to or from Strict mode.
+        First = FirstStrict;
       } else {
-        // We only need to switch to/from WQM, so we can use FirstWQM
+        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }
 
-      // Whether we need to save SCC depends on start and end states
+      // Whether we need to save SCC depends on start and end states.
       bool SaveSCC = false;
       switch (State) {
       case StateExact:
       case StateStrictWWM:
-        // Exact/WWM -> WWM: save SCC
-        // Exact/WWM -> WQM: save SCC if WQM mask is generated from exec
-        // Exact/WWM -> Exact: no save
-        SaveSCC =
-            (Needs & StateStrictWWM) || ((Needs & StateWQM) && WQMFromExec);
+      case StateStrictWQM:
+        // Exact/Strict -> Strict: save SCC
+        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
+        // Exact/Strict -> Exact: no save
+        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
         break;
       case StateWQM:
-        // WQM -> Exact/WMM: save SCC
+        // WQM -> Exact/Strict: save SCC
         SaveSCC = !(Needs & StateWQM);
         break;
       default:
@@ -1255,20 +1307,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
       MachineBasicBlock::iterator Before =
           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
 
-      if (State == StateStrictWWM) {
-        assert(SavedNonStrictWWMReg);
-        fromStrictWWM(MBB, Before, SavedNonStrictWWMReg, NonStrictWWMState);
-        LIS->createAndComputeVirtRegInterval(SavedNonStrictWWMReg);
-        SavedNonStrictWWMReg = 0;
-        State = NonStrictWWMState;
+      if (State & StateStrict) {
+        assert(State == StateStrictWWM || State == StateStrictWQM);
+        assert(SavedNonStrictReg);
+        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
+
+        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
+        SavedNonStrictReg = 0;
+        State = NonStrictState;
       }
 
-      if (Needs == StateStrictWWM) {
-        NonStrictWWMState = State;
-        assert(!SavedNonStrictWWMReg);
-        SavedNonStrictWWMReg = MRI->createVirtualRegister(BoolRC);
-        toStrictWWM(MBB, Before, SavedNonStrictWWMReg);
-        State = StateStrictWWM;
+      if (Needs & StateStrict) {
+        NonStrictState = State;
+        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
+        assert(!SavedNonStrictReg);
+        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
+
+        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
+        State = Needs;
+
       } else {
         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
           if (!WQMFromExec && (OutNeeds & StateWQM)) {
@@ -1298,10 +1355,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
         }
       }
 
-      if (Needs != (StateExact | StateWQM | StateStrictWWM)) {
+      if (Needs != (StateExact | StateWQM | StateStrict)) {
         if (Needs != (StateExact | StateWQM))
           FirstWQM = IE;
-        FirstStrictWWM = IE;
+        FirstStrict = IE;
       }
 
       if (II == IE)
@@ -1310,7 +1367,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
     II = Next;
   }
   assert(!SavedWQMReg);
-  assert(!SavedNonStrictWWMReg);
+  assert(!SavedNonStrictReg);
 }
 
 void SIWholeQuadMode::lowerLiveMaskQueries() {
@@ -1402,6 +1459,10 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
+                    << " ------------- \n");
+  LLVM_DEBUG(MF.dump(););
+
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
@@ -1442,10 +1503,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 
   LiveMaskReg = Exec;
 
-  // Shader is simple does not need WQM/StrictWWM or any complex lowering
-  if (!(GlobalFlags & (StateWQM | StateStrictWWM)) &&
-      LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() &&
-      KillInstrs.empty()) {
+  // Shader is simple and does not need any state changes or complex lowering
+  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
+      LowerToMovInstrs.empty() && KillInstrs.empty()) {
     lowerLiveMaskQueries();
    return !LiveMaskQueries.empty();
  }
@@ -186,13 +186,17 @@ main_body:
 ; Check that we don't leave WWM on for computations that don't require WWM,
 ; since that will lead to clobbering things that aren't supposed to be clobbered
 ; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
 ;
 ;CHECK-LABEL: {{^}}test_wwm3:
 ;CHECK: %if
 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 ;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
 ;CHECK: %endif
 define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
 main_body:
   ; use mbcnt to make sure the branch is divergent
@@ -215,13 +219,17 @@ endif:
 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
 ; write could clobber disabled channels in the non-WWM one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
 ;
 ;CHECK-LABEL: {{^}}test_wwm4:
 ;CHECK: %if
 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 ;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK-NEXT: v_mov_b32_e32
 ;CHECK: %endif
 define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
 main_body:
   ; use mbcnt to make sure the branch is divergent
@@ -277,6 +285,7 @@ main_body:
 ;VI-CHECK: flat_load_dword
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
 define amdgpu_ps float @test_wwm6_then() {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
@@ -310,6 +319,7 @@ endif:
 ;SI-CHECK: buffer_load_dword
 ;VI-CHECK: flat_load_dword
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
 define amdgpu_ps float @test_wwm6_loop() {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
@@ -352,6 +362,208 @@ main_body:
   ret void
 }
 
+; Check that Strict WQM is triggered by the strict_wqm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm1:
+;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Same as above, but with an integer type.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm2:
+;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_{{[iu]}}32_e32
+define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %src0.0 = bitcast float %src0 to i32
+  %src1.0 = bitcast float %src1 to i32
+  %out = add i32 %src0.0, %src1.0
+  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  ret float %out.1
+}
+
+; Check that we don't leave Strict WQM on for computations that don't require it,
+; since that will lead to clobbering things that aren't supposed to be clobbered
+; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm3:
+;CHECK: %if
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  %out.1 = fadd float %src, %out.0
+  br label %endif
+
+endif:
+  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+  ret float %out.2
+}
+
+; Check that Strict WQM writes aren't coalesced with non-strict writes, since
+; the Strict WQM write could clobber disabled channels in the non-strict one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm4:
+;CHECK: %if
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Make sure the transition from Exact to Strict WQM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm5:
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: buffer_store_dword
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %src1, %src1
+  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
+  %out = fadd float %temp.0, %temp.0
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that Strict WQM is turned on correctly across basic block boundaries.
+; if..then..endif version
+;
+;CHECK-LABEL: {{^}}test_strict_wqm6_then:
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm6_then() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Check that Strict WQM is turned on correctly across basic block boundaries.
+; loop version
+;
+;CHECK-LABEL: {{^}}test_strict_wqm6_loop:
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %loop
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
+define amdgpu_ps float @test_strict_wqm6_loop() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  br label %loop
+
+loop:
+  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  %counter.1 = sub i32 %counter, 1
+  %cc = icmp ne i32 %counter.1, 0
+  br i1 %cc, label %loop, label %endloop
+
+endloop:
+  ret float %out.0
+}
+
 ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
 ;
 ;CHECK-LABEL: {{^}}test_set_inactive2:
@@ -862,13 +1074,17 @@ main_body:
 ; Check that we don't leave WWM on for computations that don't require WWM,
 ; since that will lead to clobbering things that aren't supposed to be clobbered
 ; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
 ;
 ;CHECK-LABEL: {{^}}test_strict_wwm3:
 ;CHECK: %if
 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 ;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
 ;CHECK: %endif
 define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
 main_body:
   ; use mbcnt to make sure the branch is divergent
@@ -891,13 +1107,17 @@ endif:
 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
 ; write could clobber disabled channels in the non-WWM one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
 ;
 ;CHECK-LABEL: {{^}}test_strict_wwm4:
 ;CHECK: %if
 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 ;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK-NEXT: v_mov_b32_e32
 ;CHECK: %endif
 define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
 main_body:
   ; use mbcnt to make sure the branch is divergent
@@ -953,6 +1173,7 @@ main_body:
 ;VI-CHECK: flat_load_dword
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
 define amdgpu_ps float @test_strict_wwm6_then() {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
@@ -986,6 +1207,7 @@ endif:
 ;SI-CHECK: buffer_load_dword
 ;VI-CHECK: flat_load_dword
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
 define amdgpu_ps float @test_strict_wwm6_loop() {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
@@ -1059,7 +1281,135 @@ ENDIF:
   ret float %r
 }
 
+; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm_within_wqm:
+;CHECK: %IF
+;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: ds_swizzle
+;
+define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
+main_body:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %cmp = icmp eq i32 %z, 0
+  br i1 %cmp, label %IF, label %ENDIF
+
+IF:
+  %dataf = extractelement <4 x float> %dtex, i32 0
+  %data1 = fptosi float %dataf to i32
+  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
+  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
+  %data3f = sitofp i32 %data3 to float
+  br label %ENDIF
+
+ENDIF:
+  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
+  ret float %r
+}
+
+;CHECK-LABEL: {{^}}test_strict_wqm_strict_wwm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: s_mov_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: v_add
+;CHECK: s_mov_b64 exec, [[ORIG3]]
+
+;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample
+
+define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
+  %temp3 = fadd float %temp2, %temp2
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
+  %temp5 = fadd float %temp3, %temp4
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test_strict_wwm_strict_wqm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: s_or_saveexec_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: v_add
+;CHECK: s_mov_b64 exec, [[ORIG3]]
+
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample
+define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
+  %temp3 = fadd float %temp2, %temp2
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
+  %temp5 = fadd float %temp3, %temp4
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test_wqm_strict_wqm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_wqm_b64 exec, exec
+
+;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: image_sample
+
+define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  %temp2 = fadd float %tex, %tex
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
+  %temp4 = fadd float %temp2, %temp3
+  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
@@ -1074,6 +1424,7 @@ declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare void @llvm.amdgcn.kill(i1) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
@@ -1081,6 +1432,8 @@ declare float @llvm.amdgcn.strict.wwm.f32(float) #3
 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
 declare float @llvm.amdgcn.wwm.f32(float) #3
 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare float @llvm.amdgcn.strict.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3