[AMDGPU] Add support for Whole Wavefront Mode
Summary:
Whole Wavefront Mode (WWM) is similar to WQM, except that all of the
lanes are always enabled, regardless of control flow. This is required
for implementing wavefront reductions in non-uniform control flow, where
we need to use the inactive lanes to propagate intermediate results, so
they need to be enabled. We need to propagate WWM to uses (unless
they're explicitly marked as exact) so that they also propagate
intermediate results correctly. We do the analysis and exec mask munging
during the WQM pass, since there are interactions with WQM for things
that require both WQM and WWM. For simplicity, WWM is entirely
block-local -- blocks are never WWM on entry or exit of a block, and WWM
is not propagated to the block level. This means that computations
involving WWM cannot involve control flow, but we only ever plan to use
WWM for a few limited purposes (none of which involve control flow)
anyways.
Shaders can ask for WWM using the @llvm.amdgcn.wwm intrinsic. There
isn't yet a way to turn WWM off -- that will be added in a future
change.
Finally, it turns out that turning on inactive lanes causes a number of
problems with register allocation. While the best long-term solution
seems like teaching LLVM's register allocator about predication, for now
we need to add some hacks to prevent ourselves from getting into trouble
due to constraints that aren't currently expressed in LLVM. For the gory
details, see the comments at the top of SIFixWWMLiveness.cpp.
Reviewers: arsenm, nhaehnle, tpr
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D35524
llvm-svn: 310087
2017-08-05 02:36:52 +08:00
|
|
|
//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// \brief Computations in WWM can overwrite values in inactive channels for
|
|
|
|
/// variables that the register allocator thinks are dead. This pass adds fake
|
|
|
|
/// uses of those variables to WWM instructions to make sure that they aren't
|
|
|
|
/// overwritten.
|
|
|
|
///
|
|
|
|
/// As an example, consider this snippet:
|
|
|
|
/// %vgpr0 = V_MOV_B32_e32 0.0
|
|
|
|
/// if (...) {
|
|
|
|
/// %vgpr1 = ...
|
2017-12-07 18:40:31 +08:00
|
|
|
/// %vgpr2 = WWM killed %vgpr1
|
|
|
|
/// ... = killed %vgpr2
|
[AMDGPU] Add support for Whole Wavefront Mode
Summary:
Whole Wavefront Mode (WWM) is similar to WQM, except that all of the
lanes are always enabled, regardless of control flow. This is required
for implementing wavefront reductions in non-uniform control flow, where
we need to use the inactive lanes to propagate intermediate results, so
they need to be enabled. We need to propagate WWM to uses (unless
they're explicitly marked as exact) so that they also propagate
intermediate results correctly. We do the analysis and exec mask munging
during the WQM pass, since there are interactions with WQM for things
that require both WQM and WWM. For simplicity, WWM is entirely
block-local -- blocks are never WWM on entry or exit of a block, and WWM
is not propagated to the block level. This means that computations
involving WWM cannot involve control flow, but we only ever plan to use
WWM for a few limited purposes (none of which involve control flow)
anyways.
Shaders can ask for WWM using the @llvm.amdgcn.wwm intrinsic. There
isn't yet a way to turn WWM off -- that will be added in a future
change.
Finally, it turns out that turning on inactive lanes causes a number of
problems with register allocation. While the best long-term solution
seems like teaching LLVM's register allocator about predication, for now
we need to add some hacks to prevent ourselves from getting into trouble
due to constraints that aren't currently expressed in LLVM. For the gory
details, see the comments at the top of SIFixWWMLiveness.cpp.
Reviewers: arsenm, nhaehnle, tpr
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D35524
llvm-svn: 310087
2017-08-05 02:36:52 +08:00
|
|
|
/// %vgpr0 = V_MOV_B32_e32 1.0
|
|
|
|
/// }
|
|
|
|
/// ... = %vgpr0
|
|
|
|
///
|
|
|
|
/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
|
|
|
|
/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
|
|
|
|
/// writing %vgpr1 would only write to channels that would be clobbered by the
|
|
|
|
/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
|
|
|
|
/// it would clobber even the inactive channels for which the if-condition is
|
|
|
|
/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
|
|
|
|
/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
|
|
|
|
/// same register.
|
|
|
|
///
|
|
|
|
/// In general, we need to figure out what registers might have their inactive
|
|
|
|
/// channels, which are eventually used, accidentally clobbered by a WWM
|
|
|
|
/// instruction. We approximate this using two conditions:
|
|
|
|
///
|
|
|
|
/// 1. A definition of the variable reaches the WWM instruction.
|
|
|
|
/// 2. The variable would be live at the WWM instruction if all its defs were
|
|
|
|
/// partial defs (i.e. considered as a use), ignoring normal uses.
|
|
|
|
///
|
|
|
|
/// If a register matches both conditions, then we add an implicit use of it to
|
|
|
|
/// the WWM instruction. Condition #2 is the heart of the matter: every
|
|
|
|
/// definition is really a partial definition, since every VALU instruction is
|
|
|
|
/// implicitly predicated. We can usually ignore this, but WWM forces us not
|
|
|
|
/// to. Condition #1 prevents false positives if the variable is undefined at
|
|
|
|
/// the WWM instruction anyways. This is overly conservative in certain cases,
|
|
|
|
/// especially in uniform control flow, but this is a workaround anyways until
|
|
|
|
/// LLVM gains the notion of predicated uses and definitions of variables.
|
|
|
|
///
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "AMDGPU.h"
|
|
|
|
#include "AMDGPUSubtarget.h"
|
|
|
|
#include "SIInstrInfo.h"
|
|
|
|
#include "SIRegisterInfo.h"
|
|
|
|
#include "llvm/ADT/DepthFirstIterator.h"
|
|
|
|
#include "llvm/ADT/SparseBitVector.h"
|
2017-12-13 10:51:04 +08:00
|
|
|
#include "llvm/CodeGen/LiveIntervals.h"
|
[AMDGPU] Add support for Whole Wavefront Mode
Summary:
Whole Wavefront Mode (WWM) is similar to WQM, except that all of the
lanes are always enabled, regardless of control flow. This is required
for implementing wavefront reductions in non-uniform control flow, where
we need to use the inactive lanes to propagate intermediate results, so
they need to be enabled. We need to propagate WWM to uses (unless
they're explicitly marked as exact) so that they also propagate
intermediate results correctly. We do the analysis and exec mask munging
during the WQM pass, since there are interactions with WQM for things
that require both WQM and WWM. For simplicity, WWM is entirely
block-local -- blocks are never WWM on entry or exit of a block, and WWM
is not propagated to the block level. This means that computations
involving WWM cannot involve control flow, but we only ever plan to use
WWM for a few limited purposes (none of which involve control flow)
anyways.
Shaders can ask for WWM using the @llvm.amdgcn.wwm intrinsic. There
isn't yet a way to turn WWM off -- that will be added in a future
change.
Finally, it turns out that turning on inactive lanes causes a number of
problems with register allocation. While the best long-term solution
seems like teaching LLVM's register allocator about predication, for now
we need to add some hacks to prevent ourselves from getting into trouble
due to constraints that aren't currently expressed in LLVM. For the gory
details, see the comments at the top of SIFixWWMLiveness.cpp.
Reviewers: arsenm, nhaehnle, tpr
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D35524
llvm-svn: 310087
2017-08-05 02:36:52 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
|
|
#include "llvm/CodeGen/Passes.h"
|
2017-11-17 09:07:10 +08:00
|
|
|
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
[AMDGPU] Add support for Whole Wavefront Mode
Summary:
Whole Wavefront Mode (WWM) is similar to WQM, except that all of the
lanes are always enabled, regardless of control flow. This is required
for implementing wavefront reductions in non-uniform control flow, where
we need to use the inactive lanes to propagate intermediate results, so
they need to be enabled. We need to propagate WWM to uses (unless
they're explicitly marked as exact) so that they also propagate
intermediate results correctly. We do the analysis and exec mask munging
during the WQM pass, since there are interactions with WQM for things
that require both WQM and WWM. For simplicity, WWM is entirely
block-local -- blocks are never WWM on entry or exit of a block, and WWM
is not propagated to the block level. This means that computations
involving WWM cannot involve control flow, but we only ever plan to use
WWM for a few limited purposes (none of which involve control flow)
anyways.
Shaders can ask for WWM using the @llvm.amdgcn.wwm intrinsic. There
isn't yet a way to turn WWM off -- that will be added in a future
change.
Finally, it turns out that turning on inactive lanes causes a number of
problems with register allocation. While the best long-term solution
seems like teaching LLVM's register allocator about predication, for now
we need to add some hacks to prevent ourselves from getting into trouble
due to constraints that aren't currently expressed in LLVM. For the gory
details, see the comments at the top of SIFixWWMLiveness.cpp.
Reviewers: arsenm, nhaehnle, tpr
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D35524
llvm-svn: 310087
2017-08-05 02:36:52 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "si-fix-wwm-liveness"
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
class SIFixWWMLiveness : public MachineFunctionPass {
|
|
|
|
private:
|
|
|
|
LiveIntervals *LIS = nullptr;
|
|
|
|
const SIRegisterInfo *TRI;
|
|
|
|
MachineRegisterInfo *MRI;
|
|
|
|
|
|
|
|
public:
|
|
|
|
static char ID;
|
|
|
|
|
|
|
|
SIFixWWMLiveness() : MachineFunctionPass(ID) {
|
|
|
|
initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
|
|
|
|
|
|
bool runOnWWMInstruction(MachineInstr &MI);
|
|
|
|
|
|
|
|
void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
|
|
|
|
|
|
|
|
StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
|
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
// Should preserve the same set that TwoAddressInstructions does.
|
|
|
|
AU.addPreserved<SlotIndexes>();
|
|
|
|
AU.addPreserved<LiveIntervals>();
|
|
|
|
AU.addPreservedID(LiveVariablesID);
|
|
|
|
AU.addPreservedID(MachineLoopInfoID);
|
|
|
|
AU.addPreservedID(MachineDominatorsID);
|
|
|
|
AU.setPreservesCFG();
|
|
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
} // End anonymous namespace.
|
|
|
|
|
|
|
|
INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
|
|
|
|
"SI fix WWM liveness", false, false)
|
|
|
|
|
|
|
|
char SIFixWWMLiveness::ID = 0;
|
|
|
|
|
|
|
|
char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
|
|
|
|
|
|
|
|
/// Allocate a new SIFixWWMLiveness pass and return it as a FunctionPass
/// pointer. The caller receives the raw pointer produced by `new`.
FunctionPass *llvm::createSIFixWWMLivenessPass() {
  FunctionPass *NewPass = new SIFixWWMLiveness();
  return NewPass;
}
|
|
|
|
|
|
|
|
void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
|
|
|
|
{
|
|
|
|
for (const MachineOperand &Op : MI.defs()) {
|
|
|
|
if (Op.isReg()) {
|
|
|
|
unsigned Reg = Op.getReg();
|
|
|
|
if (TRI->isVGPR(*MRI, Reg))
|
|
|
|
Regs.set(Reg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Append an implicit use to \p WWM for every VGPR that is defined both on
/// some path leading to \p WWM and at or after \p WWM.
///
/// Two sets are collected with addDefs() and intersected; each register in
/// the intersection becomes an implicit use operand. When LiveIntervals is
/// available, the interval of each such register is rebuilt from scratch.
///
/// \returns true if at least one operand was added.
bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
  MachineBasicBlock *Block = WWM.getParent();

  // Defs from WWM itself through the end of its block, then defs in every
  // block yielded by a depth-first walk from that block. The walk is advanced
  // once before use, so its first entry is not scanned in full.
  SparseBitVector<> DefsAfter;

  for (MachineBasicBlock::iterator It(WWM), End = Block->end(); It != End;
       ++It)
    addDefs(*It, DefsAfter);

  df_iterator<MachineBasicBlock *> SuccIt = df_begin(Block);
  df_iterator<MachineBasicBlock *> SuccEnd = df_end(Block);
  for (++SuccIt; SuccIt != SuccEnd; ++SuccIt)
    for (const MachineInstr &MI : **SuccIt)
      addDefs(MI, DefsAfter);

  // Defs strictly before WWM in its block, then defs in every block yielded
  // by an inverse depth-first walk, again skipping the walk's first entry.
  SparseBitVector<> DefsBefore;

  MachineBasicBlock::reverse_iterator PrevIt(WWM);
  MachineBasicBlock::reverse_iterator PrevEnd = Block->rend();
  for (++PrevIt; PrevIt != PrevEnd; ++PrevIt)
    addDefs(*PrevIt, DefsBefore);

  idf_iterator<MachineBasicBlock *> PredIt = idf_begin(Block);
  idf_iterator<MachineBasicBlock *> PredEnd = idf_end(Block);
  for (++PredIt; PredIt != PredEnd; ++PredIt)
    for (const MachineInstr &MI : **PredIt)
      addDefs(MI, DefsBefore);

  // Keep only registers present in both sets.
  DefsAfter &= DefsBefore;

  bool AddedUse = false;
  for (unsigned Reg : DefsAfter) {
    AddedUse = true;
    WWM.addOperand(
        MachineOperand::CreateReg(Reg, /*isDef=*/false, /*isImp=*/true));

    if (!LIS)
      continue;

    // FIXME: is there a better way to update the live interval?
    // NOTE(review): addDefs() does not filter out physical VGPRs, yet the
    // call below is named for virtual registers -- confirm that physical
    // registers cannot reach this point when LIS is available.
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  return AddedUse;
}
|
|
|
|
|
|
|
|
/// Pass entry point: cache the per-function analysis pointers, then hand every
/// EXIT_WWM instruction in \p MF to runOnWWMInstruction().
///
/// \returns true if any instruction was modified.
bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();

  const SISubtarget &Subtarget = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *InstrInfo = Subtarget.getInstrInfo();
  TRI = &InstrInfo->getRegisterInfo();
  MRI = &MF.getRegInfo();

  bool Changed = false;
  for (MachineBasicBlock &Block : MF) {
    for (MachineInstr &Inst : Block) {
      if (Inst.getOpcode() != AMDGPU::EXIT_WWM)
        continue;
      Changed |= runOnWWMInstruction(Inst);
    }
  }

  return Changed;
}
|