//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"
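
// Blocks containing fewer than this many emitted instructions are assumed to
// be cheaper to execute with exec = 0 than to branch over.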
static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold-legacy",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;
  MachineDominatorTree *MDT = nullptr;

  MachineBasicBlock *EarlyExitBlock = nullptr;
  bool EarlyExitClearsExec = false;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool dominatesAllReachable(MachineBasicBlock &MBB);
  void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                  DebugLoc DL);

  bool kill(MachineInstr &MI);
  void earlyTerm(MachineInstr &MI);

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
  if (MI.isMetaInstruction())
    return true;

  // Handle target specific opcodes.
  switch (MI.getOpcode()) {
  case AMDGPU::SI_MASK_BRANCH:
    return true;
  default:
    return false;
  }
}
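
/// Check whether a branch over the blocks from \p From to \p To is worth (or
/// required for) inserting: true if the range contains an instruction that
/// must not execute with EXEC = 0, or at least SkipThreshold instructions'
/// worth of emitted code.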
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(*I))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;
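
      // Instructions such as s_sendmsg and exports interact with hardware
      // blocks outside the shader core and can have "scalar" side effects
      // even when EXEC = 0, so blocks containing them must always be
      // skipped.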
      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

/// Check whether \p MBB dominates all blocks that are reachable from it.
bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
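  // If this holds for the block containing a conditional kill, no other
  // threads in the wave can be waiting to rejoin at a later point in the
  // CFG, so exec = 0 there means the whole wave really is dead and an early
  // exit is safe.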
  for (MachineBasicBlock *Other : depth_first(&MBB)) {
    if (!MDT->dominates(&MBB, Other))
      return false;
  }
  return true;
}

static void generateEndPgm(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, DebugLoc DL,
                           const SIInstrInfo *TII, bool IsPS) {
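  // A pixel shader wave is expected to perform an export before it ends; a
  // null export (done and vm bits set, no channels enabled) satisfies this
  // ahead of the s_endpgm.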
  // "null export"
  if (IsPS) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
        .addImm(AMDGPU::Exp::ET_NULL)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addImm(1)  // vm
        .addImm(0)  // compr
        .addImm(0); // en
  }
  // s_endpgm
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}

void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
                                         bool ClearExec) {
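  // The early-exit block is created lazily, at most once per function, and
  // shared by all early exits; if any caller needs exec cleared on the exit
  // path, the clearing s_mov is likewise prepended exactly once.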
  MachineFunction *MF = MBB.getParent();
  DebugLoc DL;

  if (!EarlyExitBlock) {
    EarlyExitBlock = MF->CreateMachineBasicBlock();
    MF->insert(MF->end(), EarlyExitBlock);
    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
                   MF->getFunction().getCallingConv() ==
                       CallingConv::AMDGPU_PS);
    EarlyExitClearsExec = false;
  }

  if (ClearExec && !EarlyExitClearsExec) {
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    auto ExitI = EarlyExitBlock->getFirstNonPHI();
    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
    EarlyExitClearsExec = true;
  }
}

static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
                       MachineDominatorTree *MDT) {
  MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);

  // Update dominator tree
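  // (splitAt transferred MBB's successors to SplitBB, so mirror those edge
  // changes incrementally instead of recomputing the tree.)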
  using DomTreeT = DomTreeBase<MachineBasicBlock>;
  SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
  for (MachineBasicBlock *Succ : SplitBB->successors()) {
    DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
  }
  DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
  MDT->getBase().applyUpdates(DTUpdates);
}

/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
/// iterator. Only applies to pixel shaders.
void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, DebugLoc DL) {
  MachineFunction *MF = MBB.getParent();
  (void)MF;
  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);

  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
  // basic block that has no further successors (e.g., there was an
  // `unreachable` there in IR). This can happen with original source of the
  // form:
  //
  //   if (uniform_condition) {
  //     write_to_memory();
  //     discard;
  //   }
  //
  // In this case, we write the "null_export; s_endpgm" skip code in the
  // already-existing basic block.
  auto NextBBI = std::next(MBB.getIterator());
  bool NoSuccessor =
      I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);

  if (NoSuccessor) {
    generateEndPgm(MBB, I, DL, TII, true);
  } else {
    ensureEarlyExitBlock(MBB, false);

    MachineInstr *BranchMI =
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
            .addMBB(EarlyExitBlock);

    // Split the block if the branch will not come at the end.
    auto Next = std::next(BranchMI->getIterator());
    if (Next != MBB.end() && !Next->isTerminator())
      splitBlock(MBB, *BranchMI, MDT);

    MBB.addSuccessor(EarlyExitBlock);
    MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
  }
}

/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
/// Return true unless the terminator is a no-op.
bool SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
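    // (so SETLT, "x < imm", is emitted below as V_CMPX_GT_F32, "imm > x").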
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }
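
    // On subtargets whose V_CMPX writes its result implicitly to exec and
    // has no explicit SDST operand, switch to the no-sdst encoding.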
    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    if (ST.hasNoSdstCMPX())
      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
      if (!ST.hasNoSdstCMPX())
        I.addReg(AMDGPU::VCC, RegState::Define);

      I.addImm(0) // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0) // src1 modifiers
          .add(MI.getOperand(0));

      I.addImm(0); // omod
    }
    return true;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal) {
        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                     : AMDGPU::S_MOV_B64), Exec)
          .addImm(0);
        return true;
      }
      return false;
    }
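
    // Otherwise kill the lanes where Op == KillVal: exec &= ~Op when the
    // kill value is true, exec &= Op when it is false.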
    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    if (ST.isWave32())
      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
        .addReg(Exec)
        .add(Op);
    return true;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

void SIInsertSkips::earlyTerm(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc DL = MI.getDebugLoc();
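
  // Branch to the early-exit block when SCC is 0. Unlike the kill path, the
  // exit block used here also clears exec before ending the program.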
  ensureEarlyExitBlock(MBB, true);

  auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
                      .addMBB(EarlyExitBlock);
  auto Next = std::next(MI.getIterator());

  if (Next != MBB.end() && !Next->isTerminator())
    splitBlock(MBB, *BranchMI, MDT);

  MBB.addSuccessor(EarlyExitBlock);
  MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
}

// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
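
  // SI_MASK_BRANCH only marks a region that runs under a possibly-empty exec
  // mask; materialize a real s_cbranch_execz only if the region is expensive
  // enough that jumping over it beats falling through (see shouldSkip).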
  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
      .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  SkipThreshold = SkipThresholdFlag;
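
  // Kill and early-terminate pseudos are collected during the scan and only
  // lowered afterwards, because that lowering splits blocks and otherwise
  // modifies the CFG being iterated.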
  SmallVector<MachineInstr *, 4> KillInstrs;
  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
  bool MadeChange = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR: {
        MadeChange = true;
        bool CanKill = kill(MI);

        // Check if we can add an early "if exec=0 { end shader }".
        //
        // Note that we _always_ do this if it is correct, even if the kill
        // happens fairly late in the shader, because the null export should
        // generally still be cheaper than normal export(s).
        //
        // TODO: The dominatesAllReachable check is conservative: if the
        // dominance is only missing due to _uniform_ branches, we could
        // in fact insert the early-exit as well.
        if (CanKill &&
            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
            dominatesAllReachable(MBB)) {
          // Mark the instruction for kill-if-dead insertion. We delay this
          // change because it modifies the CFG.
          KillInstrs.push_back(&MI);
        } else {
          MI.eraseFromParent();
        }
        break;
      }
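
      // SI_KILL_CLEANUP marks a point after kill control flow where the
      // early "if exec=0 { end shader }" check is still worthwhile; it is
      // handled like the kill terminators above, minus the exec update.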
      case AMDGPU::SI_KILL_CLEANUP:
        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
            dominatesAllReachable(MBB)) {
          KillInstrs.push_back(&MI);
        } else {
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
        EarlyTermInstrs.push_back(&MI);
        break;

      default:
        break;
      }
    }
  }

  for (MachineInstr *Instr : EarlyTermInstrs) {
    // Early termination in GS does nothing
    if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
      earlyTerm(*Instr);
    Instr->eraseFromParent();
  }
  for (MachineInstr *Kill : KillInstrs) {
    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
               Kill->getDebugLoc());
    Kill->eraseFromParent();
  }
  KillInstrs.clear();
  EarlyTermInstrs.clear();
  EarlyExitBlock = nullptr;

  return MadeChange;
}
|