2017-08-08 08:47:13 +08:00
|
|
|
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
|
2017-04-12 11:25:12 +08:00
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2017-04-12 11:25:12 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
2018-05-01 23:54:18 +08:00
|
|
|
/// Insert wait instructions for memory reads and writes.
|
2017-04-12 11:25:12 +08:00
|
|
|
///
|
|
|
|
/// Memory reads and writes are issued asynchronously, so we need to insert
|
|
|
|
/// S_WAITCNT instructions when we want to access any of their results or
|
|
|
|
/// overwrite any register that's used asynchronously.
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
///
|
|
|
|
/// TODO: This pass currently keeps one timeline per hardware counter. A more
|
|
|
|
/// finely-grained approach that keeps one timeline per event type could
|
|
|
|
/// sometimes get away with generating weaker s_waitcnt instructions. For
|
|
|
|
/// example, when both SMEM and LDS are in flight and we need to wait for
|
|
|
|
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
|
|
|
|
/// but the pass will currently generate a conservative lgkmcnt(0) because
|
|
|
|
/// multiple event types are in flight.
|
2017-04-12 11:25:12 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "AMDGPU.h"
|
|
|
|
#include "AMDGPUSubtarget.h"
|
|
|
|
#include "SIDefines.h"
|
|
|
|
#include "SIInstrInfo.h"
|
|
|
|
#include "SIMachineFunctionInfo.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include "SIRegisterInfo.h"
|
2017-04-12 11:25:12 +08:00
|
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include "llvm/ADT/DenseMap.h"
|
|
|
|
#include "llvm/ADT/DenseSet.h"
|
2020-04-29 22:46:33 +08:00
|
|
|
#include "llvm/ADT/MapVector.h"
|
2017-04-12 11:25:12 +08:00
|
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include "llvm/ADT/STLExtras.h"
|
|
|
|
#include "llvm/ADT/SmallVector.h"
|
|
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
2017-04-12 11:25:12 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
2017-04-12 11:25:12 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include "llvm/CodeGen/MachineMemOperand.h"
|
|
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
2020-01-04 23:23:14 +08:00
|
|
|
#include "llvm/CodeGen/MachinePostDominators.h"
|
2017-04-12 11:25:12 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2020-01-04 23:23:14 +08:00
|
|
|
#include "llvm/InitializePasses.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include "llvm/IR/DebugLoc.h"
|
|
|
|
#include "llvm/Pass.h"
|
|
|
|
#include "llvm/Support/Debug.h"
|
2018-04-26 03:21:26 +08:00
|
|
|
#include "llvm/Support/DebugCounter.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <cstring>
|
|
|
|
#include <memory>
|
|
|
|
#include <utility>
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2018-04-26 03:21:26 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
#define DEBUG_TYPE "si-insert-waitcnts"
|
|
|
|
|
2018-04-26 03:21:26 +08:00
|
|
|
// Debug counters, one per s_waitcnt field, registered under the pass's
// DEBUG_TYPE prefix. Their descriptions state that each forces emission of the
// corresponding "<counter>(0)" wait.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");
|
|
|
|
|
2019-03-15 05:23:59 +08:00
|
|
|
// Hidden command-line switch "-amdgpu-waitcnt-forcezero"; off by default.
static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);
|
2017-04-12 11:25:12 +08:00
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
AMDGPU/InsertWaitcnts: Use foreach loops for inst and wait event types
Summary:
It hides the type casting ugliness, and I happened to have to add a new
such loop (in a later patch).
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54227
llvm-svn: 347849
2018-11-29 19:06:11 +08:00
|
|
|
template <typename EnumT>
|
|
|
|
class enum_iterator
|
|
|
|
: public iterator_facade_base<enum_iterator<EnumT>,
|
|
|
|
std::forward_iterator_tag, const EnumT> {
|
|
|
|
EnumT Value;
|
|
|
|
public:
|
|
|
|
enum_iterator() = default;
|
|
|
|
enum_iterator(EnumT Value) : Value(Value) {}
|
|
|
|
|
|
|
|
enum_iterator &operator++() {
|
|
|
|
Value = static_cast<EnumT>(Value + 1);
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
|
|
|
|
|
|
|
|
EnumT operator*() const { return Value; }
|
|
|
|
};
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
// Class of object that encapsulates latest instruction counter score
|
|
|
|
// associated with the operand. Used for determining whether
|
|
|
|
// s_waitcnt instruction needs to be emitted.
|
|
|
|
|
|
|
|
// Expands to an unsigned mask with only bit number `t` set.
#define CNT_MASK(t) (1u << (t))
|
|
|
|
|
2019-05-04 05:53:53 +08:00
|
|
|
// Wait counters tracked by this pass. The enumerators are used directly as
// array indices (e.g. into WaitEventMaskForInst), so they are dense and start
// at zero. NUM_INST_CNTS is the number of counters, not a counter itself.
enum InstCounterType {
  VM_CNT = 0,
  LGKM_CNT,
  EXP_CNT,
  VS_CNT,
  NUM_INST_CNTS
};
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Use foreach loops for inst and wait event types
Summary:
It hides the type casting ugliness, and I happened to have to add a new
such loop (in a later patch).
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54227
llvm-svn: 347849
2018-11-29 19:06:11 +08:00
|
|
|
// Returns a range that visits every counter type from VM_CNT up to, but not
// including, NUM_INST_CNTS. Intended for range-based for loops.
iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
  using CounterIter = enum_iterator<InstCounterType>;
  const CounterIter First(VM_CNT);
  const CounterIter PastLast(NUM_INST_CNTS);
  return make_range(First, PastLast);
}
|
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
// A pair of ints describing a register interval; it is the return type of
// WaitcntBrackets::getRegInterval.
// NOTE(review): presumably (first, second) bound a run of scoreboard slot
// indices -- confirm inclusiveness against getRegInterval's definition.
using RegInterval = std::pair<int, int>;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
|
|
|
// Per-counter maximum values, one field per wait counter. This is a single
// file-local object; WaitcntBrackets::getWaitCountMax reads it. The fields are
// zero until they are assigned (the assignment is not in this part of the
// file).
struct {
  unsigned VmcntMax;   // limit reported for VM_CNT
  unsigned ExpcntMax;  // limit reported for EXP_CNT
  unsigned LgkmcntMax; // limit reported for LGKM_CNT
  unsigned VscntMax;   // limit reported for VS_CNT
} HardwareLimits;
|
|
|
|
|
|
|
|
// Register encoding bounds kept in a single file-local object.
// NOTE(review): VGPR0/VGPRL and SGPR0/SGPRL presumably hold the hardware
// encodings of the first and last VGPR / SGPR -- confirm where they are
// assigned.
struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;
|
|
|
|
|
|
|
|
// Kinds of events that can be pending on a wait counter. The enumerator
// values are used as bit positions in WaitEventMaskForInst, so they must stay
// dense; NUM_WAIT_EVENTS is the count of events, not an event.
enum WaitEventType {
  VMEM_ACCESS,       // vector-memory read & write
  VMEM_READ_ACCESS,  // vector-memory read
  VMEM_WRITE_ACCESS, // vector-memory write
  LDS_ACCESS,        // lds read & write
  GDS_ACCESS,        // gds read & write
  SQ_MESSAGE,        // send message
  SMEM_ACCESS,       // scalar-memory read & write
  EXP_GPR_LOCK,      // export holding on its data src
  GDS_GPR_LOCK,      // GDS holding on its data and addr src
  EXP_POS_ACCESS,    // write to export position
  EXP_PARAM_ACCESS,  // write to export parameter
  VMW_GPR_LOCK,      // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};
|
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
|
2019-05-04 05:53:53 +08:00
|
|
|
(1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
|
|
|
|
(1 << SQ_MESSAGE),
|
|
|
|
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
|
|
|
|
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
|
2019-05-04 05:53:53 +08:00
|
|
|
(1 << VMEM_WRITE_ACCESS)
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
};
|
AMDGPU/InsertWaitcnts: Use foreach loops for inst and wait event types
Summary:
It hides the type casting ugliness, and I happened to have to add a new
such loop (in a later patch).
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54227
llvm-svn: 347849
2018-11-29 19:06:11 +08:00
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
// Layout of the scoreboard slot index space:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// A fixed number of VGPR-like slots is reserved in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_VGPRS = 256,
  // Maximum programmable SGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256,
  // A reserved slot for DS.
  NUM_EXTRA_VGPRS = 1,
  // This is a placeholder the Shader algorithm uses.
  EXTRA_VGPR_LDS = 0,
  // Where SGPR starts.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
};
|
|
|
|
|
2020-04-30 17:51:09 +08:00
|
|
|
// Categories of result-returning VMEM operations. s_waitcnt orders all of
// them through the single vmcnt counter, but without an s_waitcnt only
// instructions of the same VmemType are guaranteed to write their results in
// order. Consequently no s_waitcnt is needed between two instructions of the
// same type that write the same vgpr.
enum VmemType {
  VMEM_NOSAMPLER, // BUF instructions and MIMG instructions without a sampler.
  VMEM_SAMPLER,   // MIMG instructions with a sampler.
};
|
|
|
|
|
|
|
|
// Classifies a VMEM instruction: VMEM_SAMPLER for a MIMG instruction whose
// base opcode info has the Sampler flag set, VMEM_NOSAMPLER for everything
// else. Inst must be a VMEM instruction (asserted).
VmemType getVmemType(const MachineInstr &Inst) {
  assert(SIInstrInfo::isVMEM(Inst));
  if (SIInstrInfo::isMIMG(Inst)) {
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
    const auto *BaseInfo = AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
    if (BaseInfo->Sampler)
      return VMEM_SAMPLER;
  }
  return VMEM_NOSAMPLER;
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
// Tightens the field of Wait that corresponds to counter T so that it is no
// larger than Count; a field that is already smaller is left unchanged.
// T must be one of the four real counter types.
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  unsigned *Field = nullptr;
  switch (T) {
  case VM_CNT:
    Field = &Wait.VmCnt;
    break;
  case EXP_CNT:
    Field = &Wait.ExpCnt;
    break;
  case LGKM_CNT:
    Field = &Wait.LgkmCnt;
    break;
  case VS_CNT:
    Field = &Wait.VsCnt;
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
  *Field = std::min(*Field, Count);
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
// This object maintains the current score brackets of each wait counter, and
|
|
|
|
// a per-register scoreboard for each wait counter.
|
|
|
|
//
|
2017-04-12 11:25:12 +08:00
|
|
|
// We also maintain the latest score for every event type that can change the
|
|
|
|
// waitcnt in order to know if there are multiple types of events within
|
|
|
|
// the brackets. When multiple types of event happen in the bracket,
|
2018-03-15 06:04:32 +08:00
|
|
|
// wait count may get decreased out of order, therefore we need to put in
|
2017-04-12 11:25:12 +08:00
|
|
|
// "s_waitcnt 0" before use.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
class WaitcntBrackets {
|
2017-04-12 11:25:12 +08:00
|
|
|
public:
|
2020-04-29 21:10:56 +08:00
|
|
|
// Records the subtarget pointer in ST; the constructor body does nothing else.
WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
|
2017-08-08 08:47:13 +08:00
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
// Returns the limit stored in HardwareLimits for counter T, or 0 when T is not
// one of the four real counters (for example NUM_INST_CNTS).
static unsigned getWaitCountMax(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return HardwareLimits.VmcntMax;
  case LGKM_CNT:
    return HardwareLimits.LgkmcntMax;
  case EXP_CNT:
    return HardwareLimits.ExpcntMax;
  case VS_CNT:
    return HardwareLimits.VscntMax;
  default:
    return 0;
  }
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
// Returns the ScoreLBs entry for counter T, i.e. the lower bound of that
// counter's score bracket. T must be a real counter (asserted).
unsigned getScoreLB(InstCounterType T) const {
  assert(T < NUM_INST_CNTS);
  const unsigned LowerBound = ScoreLBs[T];
  return LowerBound;
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
// Returns the ScoreUBs entry for counter T, i.e. the upper bound of that
// counter's score bracket. T must be a real counter (asserted).
unsigned getScoreUB(InstCounterType T) const {
  assert(T < NUM_INST_CNTS);
  const unsigned UpperBound = ScoreUBs[T];
  return UpperBound;
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
|
|
|
// Mapping from event to counter.
|
|
|
|
InstCounterType eventCounter(WaitEventType E) {
|
2019-05-04 05:53:53 +08:00
|
|
|
if (WaitEventMaskForInst[VM_CNT] & (1 << E))
|
2017-04-12 11:25:12 +08:00
|
|
|
return VM_CNT;
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
|
2017-04-12 11:25:12 +08:00
|
|
|
return LGKM_CNT;
|
2019-05-04 05:53:53 +08:00
|
|
|
if (WaitEventMaskForInst[VS_CNT] & (1 << E))
|
|
|
|
return VS_CNT;
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
|
|
|
|
return EXP_CNT;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned getRegScore(int GprNo, InstCounterType T) {
|
2017-04-12 11:25:12 +08:00
|
|
|
if (GprNo < NUM_ALL_VGPRS) {
|
|
|
|
return VgprScores[T][GprNo];
|
|
|
|
}
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
assert(T == LGKM_CNT);
|
2017-04-12 11:25:12 +08:00
|
|
|
return SgprScores[GprNo - NUM_ALL_VGPRS];
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
bool merge(const WaitcntBrackets &Other);
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
|
|
|
|
const MachineRegisterInfo *MRI,
|
2020-04-28 03:14:52 +08:00
|
|
|
const SIRegisterInfo *TRI, unsigned OpNo) const;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Some more const-correctness
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54225
llvm-svn: 347192
2018-11-19 20:03:11 +08:00
|
|
|
bool counterOutOfOrder(InstCounterType T) const;
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
|
|
|
|
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
|
2020-04-30 22:05:40 +08:00
|
|
|
void determineWait(InstCounterType T, unsigned ScoreToWait,
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
AMDGPU::Waitcnt &Wait) const;
|
|
|
|
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
|
|
|
|
void applyWaitcnt(InstCounterType T, unsigned Count);
|
2017-04-12 11:25:12 +08:00
|
|
|
void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
|
|
|
|
const MachineRegisterInfo *MRI, WaitEventType E,
|
|
|
|
MachineInstr &MI);
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Report whether any wait event at all is still recorded as pending, i.e.
/// whether at least one bit of the PendingEvents mask is set.
bool hasPending() const {
  const bool AnyEventOutstanding = PendingEvents != 0;
  return AnyEventOutstanding;
}
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
/// Report whether the bit for event type \p E is set in the PendingEvents
/// mask.
bool hasPendingEvent(WaitEventType E) const {
  // Each event type owns the bit at position E in the mask.
  const unsigned EventBit = 1 << E;
  return PendingEvents & EventBit;
}
|
|
|
|
|
2020-04-30 01:40:17 +08:00
|
|
|
/// Report whether more than one of the event types selected by
/// WaitEventMaskForInst[T] is currently pending for counter \p T.
bool hasMixedPendingEvents(InstCounterType T) const {
  const unsigned Relevant = WaitEventMaskForInst[T] & PendingEvents;
  // Clearing the lowest set bit leaves a non-zero value only when at least
  // two bits were set to begin with.
  const unsigned WithoutLowestBit = Relevant & (Relevant - 1);
  return WithoutLowestBit != 0;
}
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
bool hasPendingFlat() const {
|
|
|
|
return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
|
|
|
|
LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
|
|
|
|
(LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
|
|
|
|
LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
|
|
|
|
}
|
|
|
|
|
|
|
|
void setPendingFlat() {
|
|
|
|
LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
|
|
|
|
LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
|
|
|
|
}
|
|
|
|
|
2020-04-30 17:51:09 +08:00
|
|
|
// Return true if there might be pending writes to the specified vgpr by VMEM
|
|
|
|
// instructions with types different from V.
|
|
|
|
bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
|
|
|
|
assert(GprNo < NUM_ALL_VGPRS);
|
|
|
|
return VgprVmemTypes[GprNo] & ~(1 << V);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Reset the VMEM-type bitmask recorded for vgpr \p GprNo to zero.
///
/// \p GprNo indexes VgprVmemTypes directly and must lie in
/// [0, NUM_ALL_VGPRS).
void clearVgprVmemTypes(int GprNo) {
  // GprNo is signed and getRegInterval() uses -1 for untracked registers, so
  // check both ends of the range before writing.
  assert(GprNo >= 0 && GprNo < NUM_ALL_VGPRS);
  VgprVmemTypes[GprNo] = 0;
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
void print(raw_ostream &);
|
|
|
|
void dump() { print(dbgs()); }
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
private:
|
|
|
|
/// Parameter bundle handed to mergeScore().
///
/// NOTE(review): the exact meaning of each field is defined by mergeScore()
/// and its caller, which are not visible here; the names suggest two lower
/// bounds and two shift amounts, one pair per side of a merge -- confirm.
///
/// Every field is zero-initialized so that a default-constructed MergeInfo
/// never carries indeterminate values. The struct remains an aggregate.
struct MergeInfo {
  unsigned OldLB = 0;
  unsigned OtherLB = 0;
  unsigned MyShift = 0;
  unsigned OtherShift = 0;
};
|
2020-04-30 22:05:40 +08:00
|
|
|
static bool mergeScore(const MergeInfo &M, unsigned &Score,
|
|
|
|
unsigned OtherScore);
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
/// Overwrite the lower-bound score of counter \p T with \p Val.
void setScoreLB(InstCounterType T, unsigned Val) {
  assert(T < NUM_INST_CNTS);
  unsigned &LowerBound = ScoreLBs[T];
  LowerBound = Val;
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
/// Overwrite the upper-bound score of counter \p T with \p Val.
///
/// For EXP_CNT only, the lower bound is additionally pulled up so that it
/// trails the new upper bound by at most getWaitCountMax(EXP_CNT).
void setScoreUB(InstCounterType T, unsigned Val) {
  assert(T < NUM_INST_CNTS);
  ScoreUBs[T] = Val;
  if (T != EXP_CNT)
    return;

  const unsigned Floor = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
  // Only ever raise the lower bound, never lower it.
  const bool RaisesLowerBound = ScoreLBs[EXP_CNT] < Floor;
  // Assuming getWaitCountMax() returns an unsigned value, this comparison
  // also rejects a Floor that wrapped around because the upper bound is
  // smaller than the counter maximum.
  const bool BelowUpperBound = Floor < ScoreUBs[EXP_CNT];
  if (RaisesLowerBound && BelowUpperBound)
    ScoreLBs[EXP_CNT] = Floor;
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (GprNo < NUM_ALL_VGPRS) {
|
2020-04-29 21:10:56 +08:00
|
|
|
VgprUB = std::max(VgprUB, GprNo);
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
VgprScores[T][GprNo] = Val;
|
|
|
|
} else {
|
|
|
|
assert(T == LGKM_CNT);
|
2020-04-29 21:10:56 +08:00
|
|
|
SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
|
|
|
|
const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned OpNo, unsigned Val);
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2018-07-12 04:59:01 +08:00
|
|
|
const GCNSubtarget *ST = nullptr;
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
|
|
|
|
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
|
|
|
|
unsigned PendingEvents = 0;
|
2017-04-12 11:25:12 +08:00
|
|
|
// Remember the last flat memory operation.
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned LastFlat[NUM_INST_CNTS] = {0};
|
2017-04-12 11:25:12 +08:00
|
|
|
// wait_cnt scores for every vgpr.
|
|
|
|
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
|
2020-04-30 22:05:40 +08:00
|
|
|
int VgprUB = -1;
|
|
|
|
int SgprUB = -1;
|
|
|
|
unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
|
2017-04-12 11:25:12 +08:00
|
|
|
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
|
2020-04-30 17:51:09 +08:00
|
|
|
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
|
|
|
|
// write to each vgpr.
|
|
|
|
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
|
2017-04-12 11:25:12 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
class SIInsertWaitcnts : public MachineFunctionPass {
|
|
|
|
private:
|
2018-07-12 04:59:01 +08:00
|
|
|
const GCNSubtarget *ST = nullptr;
|
2017-08-08 08:47:13 +08:00
|
|
|
const SIInstrInfo *TII = nullptr;
|
|
|
|
const SIRegisterInfo *TRI = nullptr;
|
|
|
|
const MachineRegisterInfo *MRI = nullptr;
|
2018-09-13 02:50:47 +08:00
|
|
|
AMDGPU::IsaVersion IV;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2018-02-07 10:21:21 +08:00
|
|
|
DenseSet<MachineInstr *> TrackedWaitcntSet;
|
2020-01-04 23:23:14 +08:00
|
|
|
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
|
|
|
|
MachinePostDominatorTree *PDT;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
struct BlockInfo {
|
|
|
|
MachineBasicBlock *MBB;
|
|
|
|
std::unique_ptr<WaitcntBrackets> Incoming;
|
|
|
|
bool Dirty = true;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
|
|
|
|
};
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-29 22:46:33 +08:00
|
|
|
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2018-05-07 22:43:28 +08:00
|
|
|
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
|
|
|
|
// because of amdgpu-waitcnt-forcezero flag
|
|
|
|
bool ForceEmitZeroWaitcnts;
|
2018-04-26 03:21:26 +08:00
|
|
|
bool ForceEmitWaitcnt[NUM_INST_CNTS];
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
public:
|
|
|
|
static char ID;
|
|
|
|
|
2018-06-27 05:33:38 +08:00
|
|
|
SIInsertWaitcnts() : MachineFunctionPass(ID) {
|
|
|
|
(void)ForceExpCounter;
|
|
|
|
(void)ForceLgkmCounter;
|
|
|
|
(void)ForceVMCounter;
|
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
|
|
|
|
|
|
StringRef getPassName() const override {
|
|
|
|
return "SI insert wait instructions";
|
|
|
|
}
|
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.setPreservesCFG();
|
2020-01-04 23:23:14 +08:00
|
|
|
AU.addRequired<MachinePostDominatorTree>();
|
2017-04-12 11:25:12 +08:00
|
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
|
|
}
|
|
|
|
|
2018-04-26 03:21:26 +08:00
|
|
|
bool isForceEmitWaitcnt() const {
|
AMDGPU/InsertWaitcnts: Use foreach loops for inst and wait event types
Summary:
It hides the type casting ugliness, and I happened to have to add a new
such loop (in a later patch).
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54227
llvm-svn: 347849
2018-11-29 19:06:11 +08:00
|
|
|
for (auto T : inst_counter_types())
|
2018-04-26 03:21:26 +08:00
|
|
|
if (ForceEmitWaitcnt[T])
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void setForceEmitWaitcnt() {
|
|
|
|
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
|
|
|
|
// For debug builds, get the debug counter info and adjust if need be
|
|
|
|
#ifndef NDEBUG
|
|
|
|
if (DebugCounter::isCounterSet(ForceExpCounter) &&
|
|
|
|
DebugCounter::shouldExecute(ForceExpCounter)) {
|
|
|
|
ForceEmitWaitcnt[EXP_CNT] = true;
|
|
|
|
} else {
|
|
|
|
ForceEmitWaitcnt[EXP_CNT] = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
|
|
|
|
DebugCounter::shouldExecute(ForceLgkmCounter)) {
|
|
|
|
ForceEmitWaitcnt[LGKM_CNT] = true;
|
|
|
|
} else {
|
|
|
|
ForceEmitWaitcnt[LGKM_CNT] = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (DebugCounter::isCounterSet(ForceVMCounter) &&
|
|
|
|
DebugCounter::shouldExecute(ForceVMCounter)) {
|
|
|
|
ForceEmitWaitcnt[VM_CNT] = true;
|
|
|
|
} else {
|
|
|
|
ForceEmitWaitcnt[VM_CNT] = false;
|
|
|
|
}
|
|
|
|
#endif // NDEBUG
|
|
|
|
}
|
|
|
|
|
2017-07-22 02:54:54 +08:00
|
|
|
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
bool generateWaitcntInstBefore(MachineInstr &MI,
|
|
|
|
WaitcntBrackets &ScoreBrackets,
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
MachineInstr *OldWaitcntInstr);
|
2020-09-04 18:36:29 +08:00
|
|
|
bool generateWaitcntInstAfter(MachineInstr &MI,
|
|
|
|
WaitcntBrackets &ScoreBrackets,
|
|
|
|
MachineInstr *OldWaitcntInstr);
|
2018-04-24 23:59:59 +08:00
|
|
|
void updateEventWaitcntAfter(MachineInstr &Inst,
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
WaitcntBrackets *ScoreBrackets);
|
|
|
|
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
|
|
|
|
WaitcntBrackets &ScoreBrackets);
|
2017-04-12 11:25:12 +08:00
|
|
|
};
|
|
|
|
|
2017-08-08 08:47:13 +08:00
|
|
|
} // end anonymous namespace
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Translate register operand \p OpNo of \p MI into a half-open interval
/// [first, second) of scoreboard slots.
///
/// VGPR slots start at 0. SGPR slots are offset by NUM_ALL_VGPRS. Operands
/// that are not in an allocatable class, are AGPRs, or are neither VGPR nor
/// SGPR yield {-1, -1}.
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const SIInstrInfo *TII,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            unsigned OpNo) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  assert(Op.isReg());

  const auto OpReg = Op.getReg();
  if (!TRI->isInAllocatableClass(OpReg) || TRI->isAGPR(*MRI, OpReg))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;

  const unsigned Encoding = TRI->getEncodingValue(OpReg);

  const bool IsVgpr = TRI->isVGPR(*MRI, OpReg);
  const bool IsSgpr = !IsVgpr && TRI->isSGPRReg(*MRI, OpReg);

  // TODO: Handle TTMP
  // (e.g. an additional case for TRI->isTTMP(*MRI, Op.getReg()))
  if (!IsVgpr && !IsSgpr)
    return {-1, -1};

  if (IsVgpr) {
    assert(Encoding >= RegisterEncoding.VGPR0 &&
           Encoding <= RegisterEncoding.VGPRL);
    Result.first = Encoding - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else {
    assert(Encoding >= RegisterEncoding.SGPR0 && Encoding < SQ_MAX_PGM_SGPRS);
    Result.first = Encoding - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }

  // One slot per 32 bits of the operand's register class; the +16 rounds a
  // 16-bit register up to a single slot.
  const TargetRegisterClass *OpRC = TII->getOpRegClass(*MI, OpNo);
  const unsigned SizeInBits = TRI->getRegSizeInBits(*OpRC);
  const unsigned NumSlots = (SizeInBits + 16) / 32;
  Result.second = Result.first + NumSlots;

  return Result;
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
void WaitcntBrackets::setExpScore(const MachineInstr *MI,
|
|
|
|
const SIInstrInfo *TII,
|
|
|
|
const SIRegisterInfo *TRI,
|
|
|
|
const MachineRegisterInfo *MRI, unsigned OpNo,
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned Val) {
|
2020-04-28 03:14:52 +08:00
|
|
|
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
|
2020-04-29 21:10:56 +08:00
|
|
|
assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
|
2020-04-30 22:05:40 +08:00
|
|
|
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
|
2017-04-12 11:25:12 +08:00
|
|
|
setRegScore(RegNo, EXP_CNT, Val);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
|
|
|
|
const SIRegisterInfo *TRI,
|
|
|
|
const MachineRegisterInfo *MRI,
|
|
|
|
WaitEventType E, MachineInstr &Inst) {
|
2017-04-12 11:25:12 +08:00
|
|
|
InstCounterType T = eventCounter(E);
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned CurrScore = getScoreUB(T) + 1;
|
AMDGPU/InsertWaitcnt: Consistently use uint32_t for scores / time points
Summary:
There is one obsolete reference to using -1 as an indication of "unknown",
but this isn't actually used anywhere.
Using unsigned makes robust wrapping checks easier.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, llvm-commits, tpr, t-tye, hakzsam
Differential Revision: https://reviews.llvm.org/D54230
llvm-svn: 347852
2018-11-29 19:06:21 +08:00
|
|
|
if (CurrScore == 0)
|
|
|
|
report_fatal_error("InsertWaitcnt score wraparound");
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
// PendingEvents and ScoreUB need to be update regardless if this event
|
|
|
|
// changes the score of a register or not.
|
2017-04-12 11:25:12 +08:00
|
|
|
// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
|
2020-04-30 01:40:17 +08:00
|
|
|
PendingEvents |= 1 << E;
|
2017-04-12 11:25:12 +08:00
|
|
|
setScoreUB(T, CurrScore);
|
|
|
|
|
|
|
|
if (T == EXP_CNT) {
|
|
|
|
// Put score on the source vgprs. If this is a store, just use those
|
|
|
|
// specific register(s).
|
|
|
|
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
|
2019-06-20 03:55:27 +08:00
|
|
|
int AddrOpIdx =
|
|
|
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
|
2017-04-12 11:25:12 +08:00
|
|
|
// All GDS operations must protect their address register (same as
|
|
|
|
// export.)
|
2019-06-20 03:55:27 +08:00
|
|
|
if (AddrOpIdx != -1) {
|
|
|
|
setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
2019-06-20 03:55:27 +08:00
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
if (Inst.mayStore()) {
|
2019-01-16 23:43:53 +08:00
|
|
|
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
|
|
|
|
AMDGPU::OpName::data0) != -1) {
|
|
|
|
setExpScore(
|
|
|
|
&Inst, TII, TRI, MRI,
|
|
|
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
|
|
|
|
CurrScore);
|
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
|
|
|
|
AMDGPU::OpName::data1) != -1) {
|
|
|
|
setExpScore(&Inst, TII, TRI, MRI,
|
|
|
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
|
|
|
|
AMDGPU::OpName::data1),
|
|
|
|
CurrScore);
|
|
|
|
}
|
|
|
|
} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_APPEND &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_CONSUME &&
|
|
|
|
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
|
|
|
|
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
|
|
|
|
const MachineOperand &Op = Inst.getOperand(I);
|
2020-04-28 00:15:33 +08:00
|
|
|
if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
|
2017-04-12 11:25:12 +08:00
|
|
|
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (TII->isFLAT(Inst)) {
|
|
|
|
if (Inst.mayStore()) {
|
|
|
|
setExpScore(
|
|
|
|
&Inst, TII, TRI, MRI,
|
|
|
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
|
|
|
|
CurrScore);
|
|
|
|
} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
|
|
|
|
setExpScore(
|
|
|
|
&Inst, TII, TRI, MRI,
|
|
|
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
|
|
|
|
CurrScore);
|
|
|
|
}
|
|
|
|
} else if (TII->isMIMG(Inst)) {
|
|
|
|
if (Inst.mayStore()) {
|
|
|
|
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
|
|
|
|
} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
|
|
|
|
setExpScore(
|
|
|
|
&Inst, TII, TRI, MRI,
|
|
|
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
|
|
|
|
CurrScore);
|
|
|
|
}
|
|
|
|
} else if (TII->isMTBUF(Inst)) {
|
|
|
|
if (Inst.mayStore()) {
|
|
|
|
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
|
|
|
|
}
|
|
|
|
} else if (TII->isMUBUF(Inst)) {
|
|
|
|
if (Inst.mayStore()) {
|
|
|
|
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
|
|
|
|
} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
|
|
|
|
setExpScore(
|
|
|
|
&Inst, TII, TRI, MRI,
|
|
|
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
|
|
|
|
CurrScore);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (TII->isEXP(Inst)) {
|
|
|
|
// For export the destination registers are really temps that
|
|
|
|
// can be used as the actual source after export patching, so
|
|
|
|
// we need to treat them like sources and set the EXP_CNT
|
|
|
|
// score.
|
|
|
|
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
|
|
|
|
MachineOperand &DefMO = Inst.getOperand(I);
|
|
|
|
if (DefMO.isReg() && DefMO.isDef() &&
|
2020-04-28 00:15:33 +08:00
|
|
|
TRI->isVGPR(*MRI, DefMO.getReg())) {
|
2017-04-12 11:25:12 +08:00
|
|
|
setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
|
|
|
|
CurrScore);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
|
|
|
|
MachineOperand &MO = Inst.getOperand(I);
|
2020-04-28 00:15:33 +08:00
|
|
|
if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
|
2017-04-12 11:25:12 +08:00
|
|
|
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#if 0 // TODO: check if this is handled by MUBUF code above.
|
|
|
|
} else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
|
2017-08-17 00:47:29 +08:00
|
|
|
Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
|
|
|
|
Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
|
2017-04-12 11:25:12 +08:00
|
|
|
MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
|
|
|
|
unsigned OpNo;//TODO: find the OpNo for this operand;
|
2020-04-28 03:14:52 +08:00
|
|
|
RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
|
2020-04-30 22:05:40 +08:00
|
|
|
for (int RegNo = Interval.first; RegNo < Interval.second;
|
2017-08-17 00:47:29 +08:00
|
|
|
++RegNo) {
|
2017-04-12 11:25:12 +08:00
|
|
|
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
// Match the score to the destination registers.
|
|
|
|
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
|
2020-04-28 03:14:52 +08:00
|
|
|
auto &Op = Inst.getOperand(I);
|
|
|
|
if (!Op.isReg() || !Op.isDef())
|
|
|
|
continue;
|
|
|
|
RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
|
2020-04-30 17:51:09 +08:00
|
|
|
if (T == VM_CNT) {
|
|
|
|
if (Interval.first >= NUM_ALL_VGPRS)
|
|
|
|
continue;
|
|
|
|
if (SIInstrInfo::isVMEM(Inst)) {
|
|
|
|
VmemType V = getVmemType(Inst);
|
|
|
|
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
|
|
|
|
VgprVmemTypes[RegNo] |= 1 << V;
|
|
|
|
}
|
|
|
|
}
|
2020-04-30 22:05:40 +08:00
|
|
|
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
|
2017-04-12 11:25:12 +08:00
|
|
|
setRegScore(RegNo, T, CurrScore);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (TII->isDS(Inst) && Inst.mayStore()) {
|
|
|
|
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Dump the bracket state to \p OS.
///
/// One line is written per counter type: the counter name with the width of
/// its score window (UB - LB), followed by every register whose score is
/// still above the lower bound, printed as "<score relative to LB>:<reg>".
void WaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    const unsigned LB = getScoreLB(T);
    const unsigned UB = getScoreUB(T);

    const char *Header = " UNKNOWN(";
    switch (T) {
    case VM_CNT:
      Header = " VM_CNT(";
      break;
    case LGKM_CNT:
      Header = " LGKM_CNT(";
      break;
    case EXP_CNT:
      Header = " EXP_CNT(";
      break;
    case VS_CNT:
      Header = " VS_CNT(";
      break;
    default:
      break;
    }
    OS << Header << UB - LB << "): ";

    // Register scores are only listed while the window is non-empty.
    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= VgprUB; J++) {
        const unsigned Score = getRegScore(J, T);
        if (Score > LB) {
          OS << Score - LB - 1;
          // Slots at or past SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS are shown
          // as "ds" instead of a VGPR number.
          if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS)
            OS << ":v" << J << " ";
          else
            OS << ":ds ";
        }
      }

      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= SgprUB; J++) {
          const unsigned Score = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (Score > LB)
            OS << Score - LB - 1 << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
|
|
|
|
/// whether a waitcnt instruction is needed at all.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
|
|
|
|
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
|
2019-05-04 05:53:53 +08:00
|
|
|
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
|
|
|
|
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Decide whether waiting for counter \p T to drop to \p Count is still
/// required given the current score window.
///
/// \returns true and leaves \p Count untouched when the wait is required;
/// otherwise overwrites \p Count with ~0u and returns false.
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);

  // The Count < UB test comes first so that UB - Count cannot wrap below
  // zero in unsigned arithmetic.
  const bool Required = Count < UB && UB - Count > LB;
  if (!Required)
    Count = ~0u;
  return Required;
}
|
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
AMDGPU::Waitcnt &Wait) const {
|
2017-04-12 11:25:12 +08:00
|
|
|
// If the score of src_operand falls within the bracket, we need an
|
|
|
|
// s_waitcnt instruction.
|
2020-04-30 22:05:40 +08:00
|
|
|
const unsigned LB = getScoreLB(T);
|
|
|
|
const unsigned UB = getScoreUB(T);
|
2017-04-12 11:25:12 +08:00
|
|
|
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
|
2018-06-05 00:51:59 +08:00
|
|
|
if ((T == VM_CNT || T == LGKM_CNT) &&
|
|
|
|
hasPendingFlat() &&
|
|
|
|
!ST->hasFlatLgkmVMemCountInOrder()) {
|
|
|
|
// If there is a pending FLAT operation, and this is a VMem or LGKM
|
|
|
|
// waitcnt and the target can report early completion, then we need
|
|
|
|
// to force a waitcnt 0.
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
addWait(Wait, T, 0);
|
2017-04-12 11:25:12 +08:00
|
|
|
} else if (counterOutOfOrder(T)) {
|
|
|
|
// Counter can get decremented out-of-order when there
|
2018-03-15 06:04:32 +08:00
|
|
|
// are multiple event types in the bracket. Also emit an s_wait counter
|
2017-04-12 11:25:12 +08:00
|
|
|
// with a conservative value of 0 for the counter.
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
addWait(Wait, T, 0);
|
2017-04-12 11:25:12 +08:00
|
|
|
} else {
|
AMDGPU: Handle waitcnt overflow
Summary:
The waitcnt pass can overflow the counters when the number of outstanding events
for a type exceed the capacity of the counter. This can lead to inefficient
insertion of waitcnts, or to waitcnt instructions with max values for each type.
The last situation can cause an instruction which when disassembled appears to
be an illegal waitcnt without an operand.
In these cases we should add a wait for the 'counter maximum' - 1, and update the
waitcnt brackets accordingly.
Reviewers: rampitec, arsenm
Reviewed By: rampitec
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70418
2019-11-03 05:48:40 +08:00
|
|
|
// If a counter has been maxed out avoid overflow by waiting for
|
|
|
|
// MAX(CounterType) - 1 instead.
|
2020-04-30 22:05:40 +08:00
|
|
|
unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
|
AMDGPU: Handle waitcnt overflow
Summary:
The waitcnt pass can overflow the counters when the number of outstanding events
for a type exceed the capacity of the counter. This can lead to inefficient
insertion of waitcnts, or to waitcnt instructions with max values for each type.
The last situation can cause an instruction which when disassembled appears to
be an illegal waitcnt without an operand.
In these cases we should add a wait for the 'counter maximum' - 1, and update the
waitcnt brackets accordingly.
Reviewers: rampitec, arsenm
Reviewed By: rampitec
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70418
2019-11-03 05:48:40 +08:00
|
|
|
addWait(Wait, T, NeededWait);
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Apply a decoded s_waitcnt to the brackets.
///
/// Each count carried by \p Wait is forwarded to the per-counter overload,
/// in the order VM_CNT, EXP_CNT, LGKM_CNT, VS_CNT.
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  // Pair every counter type with the matching field of the wait.
  const struct {
    InstCounterType T;
    unsigned Count;
  } PerCounter[] = {
      {VM_CNT, Wait.VmCnt},
      {EXP_CNT, Wait.ExpCnt},
      {LGKM_CNT, Wait.LgkmCnt},
      {VS_CNT, Wait.VsCnt},
  };

  for (const auto &Entry : PerCounter)
    applyWaitcnt(Entry.T, Entry.Count);
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Apply a wait of \p Count on counter \p T to the score bracket.
///
/// - A count at or above the counter's upper-bound score changes nothing.
/// - A count of zero moves the lower bound up to the upper bound and clears
///   the pending-event bits selected by WaitEventMaskForInst[T].
/// - Any other count raises the lower bound to (upper bound - Count), unless
///   the counter is currently out of order, in which case nothing is changed.
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UpperBound = getScoreUB(T);

  // Nothing to update when the requested count is not below the upper bound.
  if (UpperBound <= Count)
    return;

  if (Count == 0) {
    // Waiting for zero: collapse the bracket and drop this counter's
    // pending events.
    setScoreLB(T, UpperBound);
    PendingEvents &= ~WaitEventMaskForInst[T];
    return;
  }

  // A non-zero wait leaves the bracket untouched while the counter is out of
  // order.
  if (counterOutOfOrder(T))
    return;

  // Never move the lower bound backwards.
  const unsigned WaitedLB = UpperBound - Count;
  setScoreLB(T, std::max(getScoreLB(T), WaitedLB));
}
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
// Where there are multiple types of event in the bracket of a counter,
|
|
|
|
// the decrement may go out of order.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
// Scalar memory read always can go out of order.
|
|
|
|
if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
|
|
|
|
return true;
|
2020-04-30 01:40:17 +08:00
|
|
|
return hasMixedPendingEvents(T);
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
|
|
|
|
false)
|
2020-01-04 23:23:14 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
|
2017-04-12 11:25:12 +08:00
|
|
|
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
|
|
|
|
false)
|
|
|
|
|
|
|
|
char SIInsertWaitcnts::ID = 0;
|
|
|
|
|
|
|
|
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
|
|
|
|
|
|
|
|
/// Factory for the pass: allocates a new SIInsertWaitcnts instance and
/// returns it as a FunctionPass pointer.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  FunctionPass *Pass = new SIInsertWaitcnts();
  return Pass;
}
|
|
|
|
|
|
|
|
static bool readsVCCZ(const MachineInstr &MI) {
|
|
|
|
unsigned Opc = MI.getOpcode();
|
|
|
|
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
|
|
|
|
!MI.getOperand(1).isUndef();
|
|
|
|
}
|
|
|
|
|
2020-09-04 18:36:29 +08:00
|
|
|
// For jumps like function calls and returns, we insert waitcnts at the jump
|
|
|
|
// destination. The jump adds latency because new instructions need to be
|
|
|
|
// fetched, waiting for outstanding memory operations after the jump means
|
|
|
|
// that the memory latency can be overlapped with the jump latency.
|
|
|
|
|
|
|
|
// FIXME callWaitsOnFunctionEntry and callWaitsOnFunctionReturn should depend
|
|
|
|
// on the calling convention. We need to track the calling convention for
|
|
|
|
// call instructions, so it is available here.
|
|
|
|
|
2019-06-15 05:52:26 +08:00
|
|
|
/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
///
/// Every calling convention currently waits, so the answer is unconditionally
/// true; that may change in the future.
static bool callWaitsOnFunctionEntry() {
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
  constexpr bool CalleeWaitsOnEntry = true;
  return CalleeWaitsOnEntry;
}
|
|
|
|
|
|
|
|
/// \returns true if the callee is expected to wait for any outstanding waits
|
|
|
|
/// before returning.
|
2020-09-04 18:36:29 +08:00
|
|
|
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return false; }
|
|
|
|
|
|
|
|
/// \returns true if the instruction is an s_sendmsg of gs-done.
|
|
|
|
static bool isSendGsDoneMessage(const MachineInstr &MI) {
|
|
|
|
return (MI.getOpcode() == AMDGPU::S_SENDMSG ||
|
|
|
|
MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
|
|
|
|
(MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
|
|
|
|
AMDGPU::SendMsg::ID_GS_DONE;
|
2019-06-15 05:52:26 +08:00
|
|
|
}
|
|
|
|
|
2018-05-01 23:54:18 +08:00
|
|
|
/// Generate s_waitcnt instruction to be placed before MI.
|
2017-04-12 11:25:12 +08:00
|
|
|
/// Instructions of a given type are returned in order,
|
|
|
|
/// but instructions of different types can complete out of order.
|
|
|
|
/// We rely on this in-order completion
|
|
|
|
/// and simply assign a score to the memory access instructions.
|
|
|
|
/// We keep track of the active "score bracket" to determine
|
|
|
|
/// if an access of a memory read requires an s_waitcnt
|
|
|
|
/// and if so what the value of each counter is.
|
|
|
|
/// The "score bracket" is bound by the lower bound and upper bound
|
|
|
|
/// scores (*_score_LB and *_score_ub respectively).
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
bool SIInsertWaitcnts::generateWaitcntInstBefore(
|
|
|
|
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
MachineInstr *OldWaitcntInstr) {
|
2018-05-07 22:43:28 +08:00
|
|
|
setForceEmitWaitcnt();
|
2018-04-26 03:21:26 +08:00
|
|
|
bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
|
|
|
|
|
2020-09-09 22:24:35 +08:00
|
|
|
if (MI.isMetaInstruction())
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
return false;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
AMDGPU::Waitcnt Wait;
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
// See if this instruction has a forced S_WAITCNT VM.
|
|
|
|
// TODO: Handle other cases of NeedsWaitcntVmBefore()
|
AMDGPU/InsertWaitcnt: Remove unused WaitAtBeginning
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54229
llvm-svn: 347851
2018-11-29 19:06:18 +08:00
|
|
|
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
|
2019-05-04 05:53:53 +08:00
|
|
|
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
|
|
|
|
MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
|
|
|
|
MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
Wait.VmCnt = 0;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
AMDGPU: Separate R600 and GCN TableGen files
Summary:
We now have two sets of generated TableGen files, one for R600 and one
for GCN, so each sub-target now has its own tables of instructions,
registers, ISel patterns, etc. This should help reduce compile time
since each sub-target now only has to consider information that
is specific to itself. This will also help prevent the R600
sub-target from slowing down new features for GCN, like disassembler
support, GlobalISel, etc.
Reviewers: arsenm, nhaehnle, jvesely
Reviewed By: arsenm
Subscribers: MatzeB, kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits
Differential Revision: https://reviews.llvm.org/D46365
llvm-svn: 335942
2018-06-29 07:47:12 +08:00
|
|
|
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
|
2019-06-15 05:52:26 +08:00
|
|
|
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
|
2020-09-04 18:36:29 +08:00
|
|
|
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry())) {
|
|
|
|
// All waits must be resolved at call return.
|
|
|
|
// NOTE: this could be improved with knowledge of all call sites or
|
|
|
|
// with knowledge of the called routines.
|
|
|
|
if (callWaitsOnFunctionReturn(MI))
|
|
|
|
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
|
|
|
|
} else if (isSendGsDoneMessage(MI)) {
|
|
|
|
// Resolve vm waits before gs-done.
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
Wait.VmCnt = 0;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
#if 0 // TODO: the following blocks of logic when we have fence.
|
|
|
|
else if (MI.getOpcode() == SC_FENCE) {
|
|
|
|
const unsigned int group_size =
|
|
|
|
context->shader_info->GetMaxThreadGroupSize();
|
|
|
|
// group_size == 0 means thread group size is unknown at compile time
|
|
|
|
const bool group_is_multi_wave =
|
|
|
|
(group_size == 0 || group_size > target_info->GetWaveFrontSize());
|
|
|
|
const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
|
|
|
|
|
|
|
|
for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
|
|
|
|
SCRegType src_type = Inst->GetSrcType(i);
|
|
|
|
switch (src_type) {
|
|
|
|
case SCMEM_LDS:
|
|
|
|
if (group_is_multi_wave ||
|
2017-08-17 00:47:29 +08:00
|
|
|
context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
|
2018-04-24 23:59:59 +08:00
|
|
|
EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
|
2017-04-12 11:25:12 +08:00
|
|
|
ScoreBrackets->getScoreUB(LGKM_CNT));
|
|
|
|
// LDS may have to wait for VM_CNT after buffer load to LDS
|
|
|
|
if (target_info->HasBufferLoadToLDS()) {
|
2018-04-24 23:59:59 +08:00
|
|
|
EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
|
2017-04-12 11:25:12 +08:00
|
|
|
ScoreBrackets->getScoreUB(VM_CNT));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SCMEM_GDS:
|
|
|
|
if (group_is_multi_wave || fence_is_global) {
|
2018-04-24 23:59:59 +08:00
|
|
|
EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
|
2017-08-17 00:47:29 +08:00
|
|
|
ScoreBrackets->getScoreUB(EXP_CNT));
|
2018-04-24 23:59:59 +08:00
|
|
|
EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
|
2017-08-17 00:47:29 +08:00
|
|
|
ScoreBrackets->getScoreUB(LGKM_CNT));
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SCMEM_UAV:
|
|
|
|
case SCMEM_TFBUF:
|
|
|
|
case SCMEM_RING:
|
|
|
|
case SCMEM_SCATTER:
|
|
|
|
if (group_is_multi_wave || fence_is_global) {
|
2018-04-24 23:59:59 +08:00
|
|
|
EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
|
2017-08-17 00:47:29 +08:00
|
|
|
ScoreBrackets->getScoreUB(EXP_CNT));
|
2018-04-24 23:59:59 +08:00
|
|
|
EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
|
2017-08-17 00:47:29 +08:00
|
|
|
ScoreBrackets->getScoreUB(VM_CNT));
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SCMEM_SCRATCH:
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Export & GDS instructions do not read the EXEC mask until after the export
|
|
|
|
// is granted (which can occur well after the instruction is issued).
|
|
|
|
// The shader program must flush all EXP operations on the export-count
|
|
|
|
// before overwriting the EXEC mask.
|
|
|
|
else {
|
|
|
|
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
|
|
|
|
// Export and GDS are tracked individually, either may trigger a waitcnt
|
|
|
|
// for EXEC.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
|
|
|
|
ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
|
|
|
|
ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
|
|
|
|
ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
|
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.
This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).
This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54228
llvm-svn: 347850
2018-11-29 19:06:14 +08:00
|
|
|
Wait.ExpCnt = 0;
|
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
2020-09-04 18:36:29 +08:00
|
|
|
if (MI.isCall() && callWaitsOnFunctionEntry()) {
|
AMDGPU: Avoid overwriting saved PC
Summary:
An outstanding load with same destination sgpr as call could cause PC to be
updated with junk value on return.
Reviewers: arsenm, rampitec
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69474
2019-10-29 00:39:20 +08:00
|
|
|
// The function is going to insert a wait on everything in its prolog.
|
|
|
|
// This still needs to be careful if the call target is a load (e.g. a GOT
|
|
|
|
// load). We also need to check WAW dependency with saved PC.
|
2019-06-15 05:52:26 +08:00
|
|
|
Wait = AMDGPU::Waitcnt();
|
2017-07-22 02:54:54 +08:00
|
|
|
|
2019-06-15 05:52:26 +08:00
|
|
|
int CallAddrOpIdx =
|
|
|
|
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
|
2020-07-26 08:16:15 +08:00
|
|
|
|
|
|
|
if (MI.getOperand(CallAddrOpIdx).isReg()) {
|
|
|
|
RegInterval CallAddrOpInterval =
|
2020-04-28 03:14:52 +08:00
|
|
|
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
|
AMDGPU: Avoid overwriting saved PC
Summary:
An outstanding load with same destination sgpr as call could cause PC to be
updated with junk value on return.
Reviewers: arsenm, rampitec
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69474
2019-10-29 00:39:20 +08:00
|
|
|
|
2020-07-26 08:16:15 +08:00
|
|
|
for (int RegNo = CallAddrOpInterval.first;
|
|
|
|
RegNo < CallAddrOpInterval.second; ++RegNo)
|
|
|
|
ScoreBrackets.determineWait(
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
|
AMDGPU: Avoid overwriting saved PC
Summary:
An outstanding load with same destination sgpr as call could cause PC to be
updated with junk value on return.
Reviewers: arsenm, rampitec
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69474
2019-10-29 00:39:20 +08:00
|
|
|
|
2020-07-26 08:16:15 +08:00
|
|
|
int RtnAddrOpIdx =
|
|
|
|
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
|
|
|
|
if (RtnAddrOpIdx != -1) {
|
|
|
|
RegInterval RtnAddrOpInterval =
|
2020-04-28 03:14:52 +08:00
|
|
|
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
|
AMDGPU: Avoid overwriting saved PC
Summary:
An outstanding load with same destination sgpr as call could cause PC to be
updated with junk value on return.
Reviewers: arsenm, rampitec
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69474
2019-10-29 00:39:20 +08:00
|
|
|
|
2020-07-26 08:16:15 +08:00
|
|
|
for (int RegNo = RtnAddrOpInterval.first;
|
|
|
|
RegNo < RtnAddrOpInterval.second; ++RegNo)
|
|
|
|
ScoreBrackets.determineWait(
|
AMDGPU: Avoid overwriting saved PC
Summary:
An outstanding load with same destination sgpr as call could cause PC to be
updated with junk value on return.
Reviewers: arsenm, rampitec
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69474
2019-10-29 00:39:20 +08:00
|
|
|
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
|
2020-07-26 08:16:15 +08:00
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
2019-06-15 05:52:26 +08:00
|
|
|
} else {
|
2017-07-22 02:54:54 +08:00
|
|
|
// FIXME: Should not be relying on memoperands.
|
2019-06-15 05:52:26 +08:00
|
|
|
// Look at the source operands of every instruction to see if
|
|
|
|
// any of them results from a previous memory operation that affects
|
|
|
|
// its current usage. If so, an s_waitcnt instruction needs to be
|
|
|
|
// emitted.
|
|
|
|
// If the source operand was defined by a load, add the s_waitcnt
|
|
|
|
// instruction.
|
2020-04-28 15:54:19 +08:00
|
|
|
//
|
|
|
|
// Two cases are handled for destination operands:
|
|
|
|
// 1) If the destination operand was defined by a load, add the s_waitcnt
|
|
|
|
// instruction to guarantee the right WAW order.
|
|
|
|
// 2) If a destination operand was used by a recent export/store instruction,
|
|
|
|
// add s_waitcnt on exp_cnt to guarantee the WAR order.
|
2017-04-12 11:25:12 +08:00
|
|
|
for (const MachineMemOperand *Memop : MI.memoperands()) {
|
2020-04-28 15:54:19 +08:00
|
|
|
const Value *Ptr = Memop->getValue();
|
|
|
|
if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
|
|
|
|
addWait(Wait, LGKM_CNT, 0);
|
|
|
|
if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
|
|
|
|
SLoadAddresses.erase(Ptr);
|
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
unsigned AS = Memop->getAddrSpace();
|
2018-08-31 13:49:54 +08:00
|
|
|
if (AS != AMDGPUAS::LOCAL_ADDRESS)
|
2017-04-12 11:25:12 +08:00
|
|
|
continue;
|
|
|
|
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
|
2019-06-15 05:52:26 +08:00
|
|
|
// VM_CNT is only relevant to vgpr or LDS.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
ScoreBrackets.determineWait(
|
|
|
|
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
|
2020-04-28 15:54:19 +08:00
|
|
|
if (Memop->isStore()) {
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
ScoreBrackets.determineWait(
|
|
|
|
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
2020-04-28 03:14:52 +08:00
|
|
|
|
2020-04-28 15:54:19 +08:00
|
|
|
// Loop over use and def operands.
|
2019-06-15 05:52:26 +08:00
|
|
|
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
|
2020-04-28 15:54:19 +08:00
|
|
|
MachineOperand &Op = MI.getOperand(I);
|
|
|
|
if (!Op.isReg())
|
2020-04-28 03:14:52 +08:00
|
|
|
continue;
|
2019-06-15 05:52:26 +08:00
|
|
|
RegInterval Interval =
|
2020-04-28 03:14:52 +08:00
|
|
|
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
|
2020-09-10 04:58:52 +08:00
|
|
|
|
|
|
|
const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg());
|
2020-04-30 22:05:40 +08:00
|
|
|
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
|
2020-09-10 04:58:52 +08:00
|
|
|
if (IsVGPR) {
|
2020-04-30 17:51:09 +08:00
|
|
|
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
|
|
|
|
// previous write and this write are the same type of VMEM
|
|
|
|
// instruction, in which case they're guaranteed to write their
|
|
|
|
// results in order anyway.
|
|
|
|
if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
|
|
|
|
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
|
|
|
|
getVmemType(MI))) {
|
|
|
|
ScoreBrackets.determineWait(
|
|
|
|
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
|
|
|
|
ScoreBrackets.clearVgprVmemTypes(RegNo);
|
|
|
|
}
|
2020-04-28 15:54:19 +08:00
|
|
|
if (Op.isDef()) {
|
|
|
|
ScoreBrackets.determineWait(
|
|
|
|
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
|
|
|
|
}
|
2019-06-15 05:52:26 +08:00
|
|
|
}
|
|
|
|
ScoreBrackets.determineWait(
|
|
|
|
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
|
|
|
|
}
|
2020-04-28 15:54:19 +08:00
|
|
|
}
|
2019-06-15 05:52:26 +08:00
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
|
|
|
|
// occurs before the instruction. Doing it here prevents any additional
|
|
|
|
// S_WAITCNTs from being emitted if the instruction was marked as
|
|
|
|
// requiring a WAITCNT beforehand.
|
2017-06-03 01:40:26 +08:00
|
|
|
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
|
|
|
|
!ST->hasAutoWaitcntBeforeBarrier()) {
|
2020-08-26 02:58:23 +08:00
|
|
|
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: Remove this work-around, enable the assert for Bug 457939
|
|
|
|
// after fixing the scheduler. Also, the Shader Compiler code is
|
|
|
|
// independent of target.
|
2019-06-20 07:54:58 +08:00
|
|
|
if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
|
|
|
|
ScoreBrackets.getScoreUB(LGKM_CNT) &&
|
|
|
|
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
Wait.LgkmCnt = 0;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
// Early-out if no wait is indicated.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
|
|
|
|
bool Modified = false;
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
if (OldWaitcntInstr) {
|
2019-05-04 05:53:53 +08:00
|
|
|
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
|
|
|
|
&*II != &MI; II = NextI, ++NextI) {
|
|
|
|
if (II->isDebugInstr())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (TrackedWaitcntSet.count(&*II)) {
|
|
|
|
TrackedWaitcntSet.erase(&*II);
|
|
|
|
II->eraseFromParent();
|
|
|
|
Modified = true;
|
|
|
|
} else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
|
|
|
|
int64_t Imm = II->getOperand(0).getImm();
|
|
|
|
ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
|
|
|
|
} else {
|
|
|
|
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
|
|
|
|
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
|
2020-08-26 03:13:24 +08:00
|
|
|
auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
|
|
|
|
ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
|
2019-05-04 05:53:53 +08:00
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
return Modified;
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
if (ForceEmitZeroWaitcnts)
|
2020-08-26 02:58:23 +08:00
|
|
|
Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
if (ForceEmitWaitcnt[VM_CNT])
|
|
|
|
Wait.VmCnt = 0;
|
|
|
|
if (ForceEmitWaitcnt[EXP_CNT])
|
|
|
|
Wait.ExpCnt = 0;
|
|
|
|
if (ForceEmitWaitcnt[LGKM_CNT])
|
|
|
|
Wait.LgkmCnt = 0;
|
2019-05-04 05:53:53 +08:00
|
|
|
if (ForceEmitWaitcnt[VS_CNT])
|
|
|
|
Wait.VsCnt = 0;
|
2018-02-16 06:03:55 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
ScoreBrackets.applyWaitcnt(Wait);
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
AMDGPU::Waitcnt OldWait;
|
2019-05-04 05:53:53 +08:00
|
|
|
bool Modified = false;
|
|
|
|
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
if (OldWaitcntInstr) {
|
2019-05-04 05:53:53 +08:00
|
|
|
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
|
|
|
|
&*II != &MI; II = NextI, NextI++) {
|
|
|
|
if (II->isDebugInstr())
|
|
|
|
continue;
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
|
2019-05-04 05:53:53 +08:00
|
|
|
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
|
|
|
|
unsigned IEnc = II->getOperand(0).getImm();
|
|
|
|
AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
|
|
|
|
OldWait = OldWait.combined(IWait);
|
|
|
|
if (!TrackedWaitcntSet.count(&*II))
|
|
|
|
Wait = Wait.combined(IWait);
|
|
|
|
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
|
|
|
|
if (IEnc != NewEnc) {
|
|
|
|
II->getOperand(0).setImm(NewEnc);
|
|
|
|
Modified = true;
|
|
|
|
}
|
|
|
|
Wait.VmCnt = ~0u;
|
|
|
|
Wait.LgkmCnt = ~0u;
|
|
|
|
Wait.ExpCnt = ~0u;
|
|
|
|
} else {
|
|
|
|
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
|
|
|
|
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
|
|
|
|
|
2020-08-26 03:13:24 +08:00
|
|
|
unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
|
|
|
|
->getImm();
|
2019-05-04 05:53:53 +08:00
|
|
|
OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
|
|
|
|
if (!TrackedWaitcntSet.count(&*II))
|
|
|
|
Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
|
|
|
|
if (Wait.VsCnt != ICnt) {
|
2020-08-26 03:13:24 +08:00
|
|
|
TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
|
2019-05-04 05:53:53 +08:00
|
|
|
Modified = true;
|
|
|
|
}
|
|
|
|
Wait.VsCnt = ~0u;
|
|
|
|
}
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
|
2019-11-25 23:21:18 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
|
2020-04-28 00:10:30 +08:00
|
|
|
<< "Old Instr: " << MI
|
2019-05-04 05:53:53 +08:00
|
|
|
<< "New Instr: " << *II << '\n');
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
|
2019-05-04 05:53:53 +08:00
|
|
|
if (!Wait.hasWait())
|
|
|
|
return Modified;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
|
|
|
|
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
|
|
|
|
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
|
|
|
|
.addImm(Enc);
|
|
|
|
TrackedWaitcntSet.insert(SWaitInst);
|
2019-05-04 05:53:53 +08:00
|
|
|
Modified = true;
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
|
2019-11-25 23:21:18 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
|
2020-04-28 00:10:30 +08:00
|
|
|
<< "Old Instr: " << MI
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
<< "New Instr: " << *SWaitInst << '\n');
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
2019-05-04 05:53:53 +08:00
|
|
|
if (Wait.VsCnt != ~0u) {
|
|
|
|
assert(ST->hasVscnt());
|
|
|
|
|
|
|
|
auto SWaitInst =
|
|
|
|
BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
|
|
|
|
TII->get(AMDGPU::S_WAITCNT_VSCNT))
|
|
|
|
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
|
|
|
|
.addImm(Wait.VsCnt);
|
|
|
|
TrackedWaitcntSet.insert(SWaitInst);
|
|
|
|
Modified = true;
|
|
|
|
|
2019-11-25 23:21:18 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
|
2020-04-28 00:10:30 +08:00
|
|
|
<< "Old Instr: " << MI
|
2019-05-04 05:53:53 +08:00
|
|
|
<< "New Instr: " << *SWaitInst << '\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
return Modified;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
2020-09-04 18:36:29 +08:00
|
|
|
/// Emit zero-count waits directly after a non-tail call instruction.
///
/// Nothing is inserted (and false is returned) when \p MI is not a call, is a
/// terminator, when callWaitsOnFunctionReturn(MI) holds, or when \p MI is the
/// last instruction before the end of the list or is immediately followed by
/// a return.
///
/// Otherwise an S_WAITCNT with immediate 0 is placed right after \p MI and,
/// if the subtarget reports hasVscnt(), an S_WAITCNT_VSCNT on SGPR_NULL with
/// immediate 0 follows it.
///
/// \param MI              The instruction to inspect.
/// \param ScoreBrackets   Not referenced by this function body.
/// \param OldWaitcntInstr Not referenced by this function body.
/// \returns true iff new instructions were inserted.
bool SIInsertWaitcnts::generateWaitcntInstAfter(MachineInstr &MI,
                                                WaitcntBrackets &ScoreBrackets,
                                                MachineInstr *OldWaitcntInstr) {
  // Only calls that are not tail calls (i.e. not terminators) are candidates.
  const bool IsNonTailCall = MI.isCall() && !MI.isTerminator();
  if (!IsNonTailCall || callWaitsOnFunctionReturn(MI))
    return false;

  // Position immediately following the call; new instructions go before it.
  auto InsertPt = std::next(MI.getIterator());

  // A wait is pointless if the function returns right after the call.
  if (InsertPt.isEnd() || InsertPt->isReturn())
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();

  BuildMI(MBB, InsertPt, Loc, TII->get(AMDGPU::S_WAITCNT)).addImm(0);

  if (ST->hasVscnt()) {
    BuildMI(MBB, InsertPt, Loc, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
  }

  return true;
}
|
|
|
|
|
2017-07-22 02:54:54 +08:00
|
|
|
// This is a flat memory operation. Check to see if it has memory
|
|
|
|
// tokens for both LDS and Memory, and if so mark it as a flat.
|
|
|
|
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
|
|
|
|
if (MI.memoperands_empty())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
for (const MachineMemOperand *Memop : MI.memoperands()) {
|
|
|
|
unsigned AS = Memop->getAddrSpace();
|
2018-08-31 13:49:54 +08:00
|
|
|
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
|
2017-07-22 02:54:54 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Record in \p ScoreBrackets the wait events that \p Inst generates.
///
/// The instruction is classified by kind (DS, FLAT, VMEM, SMRD, call, or one
/// of a few special opcodes) and the matching event(s) are passed to
/// WaitcntBrackets::updateByEvent. Instructions that match none of the
/// categories leave the brackets untouched.
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  const unsigned Opc = Inst.getOpcode();

  // Shorthand for registering a single event for Inst.
  auto Record = [&](auto Event) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, Event, Inst);
  };

  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    const bool IsGDS = TII->isAlwaysGDS(Opc) ||
                       TII->hasModifiersSet(Inst, AMDGPU::OpName::gds);
    if (!IsGDS) {
      Record(LDS_ACCESS);
      return;
    }
    Record(GDS_ACCESS);
    Record(GDS_GPR_LOCK);
    return;
  }

  if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoadOrStore());

    if (TII->usesVM_CNT(Inst)) {
      // Without a separate vscnt every access is a plain VMEM access. With
      // it, a load whose opcode has no atomic-return mapping counts as a
      // read and everything else counts as a write.
      Record(!ST->hasVscnt()
                 ? VMEM_ACCESS
                 : (Inst.mayLoad() && AMDGPU::getAtomicRetOp(Opc) == -1)
                       ? VMEM_READ_ACCESS
                       : VMEM_WRITE_ACCESS);
    }

    if (TII->usesLGKM_CNT(Inst)) {
      Record(LDS_ACCESS);

      // Note the flat memory operation: while it is pending, a VM or LGKM
      // dependency requires both the VM and the LGKM counter to be flushed
      // to zero.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
    return;
  }

  // TODO: get a better carve out.
  // VMEM opcodes listed here are deliberately not handled by the VMEM path
  // below; they continue on to the remaining checks instead.
  const auto IsExcludedVMEMOpcode = [Opc] {
    switch (Opc) {
    case AMDGPU::BUFFER_WBINVL1:
    case AMDGPU::BUFFER_WBINVL1_SC:
    case AMDGPU::BUFFER_WBINVL1_VOL:
    case AMDGPU::BUFFER_GL0_INV:
    case AMDGPU::BUFFER_GL1_INV:
      return true;
    default:
      return false;
    }
  };

  if (SIInstrInfo::isVMEM(Inst) && !IsExcludedVMEMOpcode()) {
    if (!ST->hasVscnt()) {
      Record(VMEM_ACCESS);
    } else if ((Inst.mayLoad() && AMDGPU::getAtomicRetOp(Opc) == -1) ||
               /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
               (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore())) {
      Record(VMEM_READ_ACCESS);
    } else if (Inst.mayStore()) {
      Record(VMEM_WRITE_ACCESS);
    }

    const bool NeedsExpWait =
        ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Opc) != -1);
    if (NeedsExpWait)
      Record(VMW_GPR_LOCK);
    return;
  }

  if (TII->isSMRD(Inst)) {
    Record(SMEM_ACCESS);
    return;
  }

  if (Inst.isCall()) {
    // Act as a wait on everything. Either the callee waits or we insert a wait.
    ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
    return;
  }

  switch (Opc) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
    Record(SQ_MESSAGE);
    break;

  case AMDGPU::EXP:
  case AMDGPU::EXP_DONE: {
    // The export target selects the event: 32..63 is recorded as a parameter
    // access, 12..15 as a position access, anything else as a GPR lock.
    const int Tgt = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Tgt >= 32 && Tgt <= 63)
      Record(EXP_PARAM_ACCESS);
    else if (Tgt >= 12 && Tgt <= 15)
      Record(EXP_POS_ACCESS);
    else
      Record(EXP_GPR_LOCK);
    break;
  }

  case AMDGPU::S_MEMTIME:
  case AMDGPU::S_MEMREALTIME:
    Record(SMEM_ACCESS);
    break;

  default:
    break;
  }
}
|
|
|
|
|
2020-04-30 22:05:40 +08:00
|
|
|
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
|
|
|
|
unsigned OtherScore) {
|
|
|
|
unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
|
|
|
|
unsigned OtherShifted =
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
|
|
|
|
Score = std::max(MyShifted, OtherShifted);
|
|
|
|
return OtherShifted > MyShifted;
|
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
/// Merge the pending events and associater score brackets of \p Other into
|
|
|
|
/// this brackets status.
|
|
|
|
///
|
|
|
|
/// Returns whether the merge resulted in a change that requires tighter waits
|
|
|
|
/// (i.e. the merged brackets strictly dominate the original brackets).
|
|
|
|
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
|
|
|
|
bool StrictDom = false;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-29 21:10:56 +08:00
|
|
|
VgprUB = std::max(VgprUB, Other.VgprUB);
|
|
|
|
SgprUB = std::max(SgprUB, Other.SgprUB);
|
|
|
|
|
AMDGPU/InsertWaitcnts: Use foreach loops for inst and wait event types
Summary:
It hides the type casting ugliness, and I happened to have to add a new
such loop (in a later patch).
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54227
llvm-svn: 347849
2018-11-29 19:06:11 +08:00
|
|
|
for (auto T : inst_counter_types()) {
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
// Merge event flags for this counter
|
|
|
|
const bool OldOutOfOrder = counterOutOfOrder(T);
|
2020-04-30 22:05:40 +08:00
|
|
|
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
|
|
|
|
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (OtherEvents & ~OldEvents)
|
|
|
|
StrictDom = true;
|
|
|
|
PendingEvents |= OtherEvents;
|
|
|
|
|
|
|
|
// Merge scores for this counter
|
2020-04-30 22:05:40 +08:00
|
|
|
const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
|
|
|
|
const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
|
|
|
|
const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
|
2020-04-29 23:58:07 +08:00
|
|
|
if (NewUB < ScoreLBs[T])
|
|
|
|
report_fatal_error("waitcnt score overflow");
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
MergeInfo M;
|
|
|
|
M.OldLB = ScoreLBs[T];
|
|
|
|
M.OtherLB = Other.ScoreLBs[T];
|
2020-04-29 23:58:07 +08:00
|
|
|
M.MyShift = NewUB - ScoreUBs[T];
|
|
|
|
M.OtherShift = NewUB - Other.ScoreUBs[T];
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
|
|
|
|
ScoreUBs[T] = NewUB;
|
|
|
|
|
|
|
|
StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
|
|
|
|
|
|
|
|
bool RegStrictDom = false;
|
2020-04-29 21:10:56 +08:00
|
|
|
for (int J = 0; J <= VgprUB; J++) {
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 17:51:09 +08:00
|
|
|
if (T == VM_CNT) {
|
|
|
|
for (int J = 0; J <= VgprUB; J++) {
|
|
|
|
unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
|
|
|
|
RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
|
|
|
|
VgprVmemTypes[J] = NewVmemTypes;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (T == LGKM_CNT) {
|
2020-04-29 21:10:56 +08:00
|
|
|
for (int J = 0; J <= SgprUB; J++) {
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (RegStrictDom && !OldOutOfOrder)
|
|
|
|
StrictDom = true;
|
2018-04-19 23:42:30 +08:00
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
return StrictDom;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Generate s_waitcnt instructions where needed.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|
|
|
MachineBasicBlock &Block,
|
|
|
|
WaitcntBrackets &ScoreBrackets) {
|
|
|
|
bool Modified = false;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2018-04-26 03:21:26 +08:00
|
|
|
dbgs() << "*** Block" << Block.getNumber() << " ***";
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
ScoreBrackets.dump();
|
2017-04-12 11:25:12 +08:00
|
|
|
});
|
|
|
|
|
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69661
2019-10-30 20:18:51 +08:00
|
|
|
// Assume VCCZ is correct at basic block boundaries, unless and until we need
|
|
|
|
// to handle cases where that is not true.
|
|
|
|
bool VCCZCorrect = true;
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
// Walk over the instructions.
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
MachineInstr *OldWaitcntInstr = nullptr;
|
|
|
|
|
2019-07-03 08:30:44 +08:00
|
|
|
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
|
|
|
|
E = Block.instr_end();
|
2017-04-12 11:25:12 +08:00
|
|
|
Iter != E;) {
|
|
|
|
MachineInstr &Inst = *Iter;
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
|
2019-05-04 05:53:53 +08:00
|
|
|
// Track pre-existing waitcnts from earlier iterations.
|
|
|
|
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
|
|
|
|
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
|
|
|
|
Inst.getOperand(0).isReg() &&
|
|
|
|
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
|
|
|
|
if (!OldWaitcntInstr)
|
|
|
|
OldWaitcntInstr = &Inst;
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
++Iter;
|
2017-04-12 11:25:12 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69661
2019-10-30 20:18:51 +08:00
|
|
|
// We might need to restore vccz to its correct value for either of two
|
|
|
|
// different reasons; see ST->hasReadVCCZBug() and
|
|
|
|
// ST->partialVCCWritesUpdateVCCZ().
|
|
|
|
bool RestoreVCCZ = false;
|
[AMDGPU] Simplify VCCZ bug handling
Summary:
VCCZBugHandledSet was used to make sure we don't apply the same
workaround more than once to a single cbranch instruction, but it's not
necessary because the workaround involves inserting an s_waitcnt
instruction, which is enough for subsequent iterations to detect that no
further workaround is necessary.
Also beef up the test case to check that the workaround was only applied
once. I have also manually verified that the test still passes even if I
hack the big do-while loop in runOnMachineFunction to run a minimum of
five iterations.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69621
2019-10-30 21:47:32 +08:00
|
|
|
if (readsVCCZ(Inst)) {
|
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69661
2019-10-30 20:18:51 +08:00
|
|
|
if (!VCCZCorrect)
|
|
|
|
RestoreVCCZ = true;
|
|
|
|
else if (ST->hasReadVCCZBug()) {
|
|
|
|
// There is a hardware bug on CI/SI where SMRD instruction may corrupt
|
|
|
|
// vccz bit, so when we detect that an instruction may read from a
|
|
|
|
// corrupt vccz bit, we need to:
|
|
|
|
// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
|
|
|
|
// operations to complete.
|
|
|
|
// 2. Restore the correct value of vccz by writing the current value
|
|
|
|
// of vcc back to vcc.
|
|
|
|
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
|
|
|
|
ScoreBrackets.getScoreUB(LGKM_CNT) &&
|
|
|
|
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
|
|
|
|
RestoreVCCZ = true;
|
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-04 23:23:14 +08:00
|
|
|
if (TII->isSMRD(Inst)) {
|
|
|
|
for (const MachineMemOperand *Memop : Inst.memoperands()) {
|
|
|
|
const Value *Ptr = Memop->getValue();
|
|
|
|
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69661
2019-10-30 20:18:51 +08:00
|
|
|
if (!ST->partialVCCWritesUpdateVCCZ()) {
|
|
|
|
// Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
|
|
|
|
// Writes to vcc will fix it.
|
|
|
|
if (Inst.definesRegister(AMDGPU::VCC_LO) ||
|
|
|
|
Inst.definesRegister(AMDGPU::VCC_HI))
|
|
|
|
VCCZCorrect = false;
|
|
|
|
else if (Inst.definesRegister(AMDGPU::VCC))
|
|
|
|
VCCZCorrect = true;
|
|
|
|
}
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
// Generate an s_waitcnt instruction to be placed before
|
|
|
|
// Inst, if needed.
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
|
2020-09-04 18:36:29 +08:00
|
|
|
Modified |= generateWaitcntInstAfter(Inst, ScoreBrackets, OldWaitcntInstr);
|
AMDGPU/InsertWaitcnts: Untangle some semi-global state
Summary:
Reduce the statefulness of the algorithm in two ways:
1. More clearly split generateWaitcntInstBefore into two phases: the
first one which determines the required wait, if any, without changing
the ScoreBrackets, and the second one which actually inserts the wait
and updates the brackets.
2. Communicate pre-existing s_waitcnt instructions using an argument to
generateWaitcntInstBefore instead of through the ScoreBrackets.
To simplify these changes, a Waitcnt structure is introduced which carries
the counts of an s_waitcnt instruction in decoded form.
There are some functional changes:
1. The FIXME for the VCCZ bug workaround was implemented: we only wait for
SMEM instructions as required instead of waiting on all counters.
2. We now properly track pre-existing waitcnt's in all cases, which leads
to less conservative waitcnts being emitted in some cases.
s_load_dword ...
s_waitcnt lgkmcnt(0) <-- pre-existing wait count
ds_read_b32 v0, ...
ds_read_b32 v1, ...
s_waitcnt lgkmcnt(0) <-- this is too conservative
use(v0)
more code
use(v1)
This increases code size a bit, but the reduced latency should still be a
win in basically all cases. The worst code size regressions in my shader-db
are:
WORST REGRESSIONS - Code Size
Before After Delta Percentage
1724 1736 12 0.70 % shaders/private/f1-2015/1334.shader_test [0]
2276 2284 8 0.35 % shaders/private/f1-2015/1306.shader_test [0]
4632 4640 8 0.17 % shaders/private/ue4_elemental/62.shader_test [0]
2376 2384 8 0.34 % shaders/private/f1-2015/1308.shader_test [0]
3284 3292 8 0.24 % shaders/private/talos_principle/1955.shader_test [0]
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54226
llvm-svn: 347848
2018-11-29 19:06:06 +08:00
|
|
|
OldWaitcntInstr = nullptr;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
updateEventWaitcntAfter(Inst, &ScoreBrackets);
|
2017-04-12 11:25:12 +08:00
|
|
|
|
|
|
|
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
|
|
|
|
// If this instruction generates a S_SETVSKIP because it is an
|
|
|
|
// indexed resource, and we are on Tahiti, then it will also force
|
|
|
|
// an S_WAITCNT vmcnt(0)
|
|
|
|
if (RequireCheckResourceType(Inst, context)) {
|
|
|
|
// Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
|
|
|
|
ScoreBrackets->setScoreLB(VM_CNT,
|
2017-08-17 00:47:29 +08:00
|
|
|
ScoreBrackets->getScoreUB(VM_CNT));
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
2018-01-31 01:17:06 +08:00
|
|
|
Inst.print(dbgs());
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
ScoreBrackets.dump();
|
2017-04-12 11:25:12 +08:00
|
|
|
});
|
|
|
|
|
|
|
|
// TODO: Remove this work-around after fixing the scheduler and enable the
|
|
|
|
// assert above.
|
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69661
2019-10-30 20:18:51 +08:00
|
|
|
if (RestoreVCCZ) {
|
2017-04-12 11:25:12 +08:00
|
|
|
// Restore the vccz bit. Any time a value is written to vcc, the vccz
|
|
|
|
// bit is updated, so we can restore the bit by reading the value of
|
|
|
|
// vcc and then writing it back to the register.
|
2019-05-04 05:53:53 +08:00
|
|
|
BuildMI(Block, Inst, Inst.getDebugLoc(),
|
2019-06-17 01:13:09 +08:00
|
|
|
TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
|
|
|
|
TRI->getVCC())
|
|
|
|
.addReg(TRI->getVCC());
|
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69661
2019-10-30 20:18:51 +08:00
|
|
|
VCCZCorrect = true;
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
Modified = true;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
++Iter;
|
|
|
|
}
|
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
return Modified;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
|
2018-07-12 04:59:01 +08:00
|
|
|
ST = &MF.getSubtarget<GCNSubtarget>();
|
2017-04-12 11:25:12 +08:00
|
|
|
TII = ST->getInstrInfo();
|
|
|
|
TRI = &TII->getRegisterInfo();
|
|
|
|
MRI = &MF.getRegInfo();
|
2018-09-13 02:50:47 +08:00
|
|
|
IV = AMDGPU::getIsaVersion(ST->getCPU());
|
2017-06-01 00:44:23 +08:00
|
|
|
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
2020-01-04 23:23:14 +08:00
|
|
|
PDT = &getAnalysis<MachinePostDominatorTree>();
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2018-05-07 22:43:28 +08:00
|
|
|
ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
|
AMDGPU/InsertWaitcnts: Use foreach loops for inst and wait event types
Summary:
It hides the type casting ugliness, and I happened to have to add a new
such loop (in a later patch).
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54227
llvm-svn: 347849
2018-11-29 19:06:11 +08:00
|
|
|
for (auto T : inst_counter_types())
|
2018-04-26 03:21:26 +08:00
|
|
|
ForceEmitWaitcnt[T] = false;
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
|
|
|
|
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
|
|
|
|
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
|
2019-05-04 05:53:53 +08:00
|
|
|
HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2020-04-29 21:10:56 +08:00
|
|
|
unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
|
|
|
|
unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
|
|
|
|
assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
|
|
|
|
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
|
2017-04-12 11:25:12 +08:00
|
|
|
|
|
|
|
RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
|
2020-04-29 21:10:56 +08:00
|
|
|
RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
|
2017-04-12 11:25:12 +08:00
|
|
|
RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
|
2020-04-29 21:10:56 +08:00
|
|
|
RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
|
2017-04-12 11:25:12 +08:00
|
|
|
|
2018-02-07 10:21:21 +08:00
|
|
|
TrackedWaitcntSet.clear();
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
BlockInfos.clear();
|
|
|
|
|
|
|
|
// Keep iterating over the blocks in reverse post order, inserting and
|
|
|
|
// updating s_waitcnt where needed, until a fix point is reached.
|
2020-04-29 22:46:33 +08:00
|
|
|
for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
|
|
|
|
BlockInfos.insert({MBB, BlockInfo(MBB)});
|
2018-02-07 10:21:21 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
std::unique_ptr<WaitcntBrackets> Brackets;
|
2017-04-12 11:25:12 +08:00
|
|
|
bool Modified = false;
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
bool Repeat;
|
|
|
|
do {
|
|
|
|
Repeat = false;
|
|
|
|
|
2020-04-29 22:46:33 +08:00
|
|
|
for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
|
|
|
|
++BII) {
|
|
|
|
BlockInfo &BI = BII->second;
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (!BI.Dirty)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (BI.Incoming) {
|
|
|
|
if (!Brackets)
|
2019-08-15 23:54:37 +08:00
|
|
|
Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
else
|
|
|
|
*Brackets = *BI.Incoming;
|
|
|
|
} else {
|
|
|
|
if (!Brackets)
|
2019-08-15 23:54:37 +08:00
|
|
|
Brackets = std::make_unique<WaitcntBrackets>(ST);
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
else
|
2020-04-29 21:10:56 +08:00
|
|
|
*Brackets = WaitcntBrackets(ST);
|
2018-04-19 23:42:30 +08:00
|
|
|
}
|
2017-04-12 11:25:12 +08:00
|
|
|
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
|
|
|
|
BI.Dirty = false;
|
|
|
|
|
|
|
|
if (Brackets->hasPending()) {
|
|
|
|
BlockInfo *MoveBracketsToSucc = nullptr;
|
|
|
|
for (MachineBasicBlock *Succ : BI.MBB->successors()) {
|
2020-04-29 22:46:33 +08:00
|
|
|
auto SuccBII = BlockInfos.find(Succ);
|
|
|
|
BlockInfo &SuccBI = SuccBII->second;
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (!SuccBI.Incoming) {
|
|
|
|
SuccBI.Dirty = true;
|
2020-04-29 22:46:33 +08:00
|
|
|
if (SuccBII <= BII)
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
Repeat = true;
|
|
|
|
if (!MoveBracketsToSucc) {
|
|
|
|
MoveBracketsToSucc = &SuccBI;
|
|
|
|
} else {
|
2019-08-15 23:54:37 +08:00
|
|
|
SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
}
|
|
|
|
} else if (SuccBI.Incoming->merge(*Brackets)) {
|
|
|
|
SuccBI.Dirty = true;
|
2020-04-29 22:46:33 +08:00
|
|
|
if (SuccBII <= BII)
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
Repeat = true;
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
if (MoveBracketsToSucc)
|
|
|
|
MoveBracketsToSucc->Incoming = std::move(Brackets);
|
2017-04-12 11:25:12 +08:00
|
|
|
}
|
|
|
|
}
|
AMDGPU/InsertWaitcnts: Remove the dependence on MachineLoopInfo
Summary:
MachineLoopInfo cannot be relied on for correctness, because it cannot
properly recognize loops in irreducible control flow which can be
introduced by late machine basic block optimization passes. See the new
test case for the reduced form of an example that occurred in practice.
Use a simple fixpoint iteration instead.
In order to facilitate this change, refactor WaitcntBrackets so that it
only tracks pending events and registers, rather than also maintaining
state that is relevant for the high-level algorithm. Various accessor
methods can be removed or made private as a consequence.
Affects (in radv):
- dEQP-VK.glsl.loops.special.{for,while}_uniform_iterations.select_iteration_count_{fragment,vertex}
Fixes: r345719 ("AMDGPU: Rewrite SILowerI1Copies to always stay on SALU")
Reviewers: msearles, rampitec, scott.linder, kanarayan
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam
Differential Revision: https://reviews.llvm.org/D54231
llvm-svn: 347853
2018-11-29 19:06:26 +08:00
|
|
|
} while (Repeat);
|
2017-04-12 11:25:12 +08:00
|
|
|
|
|
|
|
SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
|
|
|
|
|
|
|
|
bool HaveScalarStores = false;
|
|
|
|
|
|
|
|
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
|
|
|
|
++BI) {
|
|
|
|
MachineBasicBlock &MBB = *BI;
|
|
|
|
|
|
|
|
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
|
|
|
|
++I) {
|
|
|
|
if (!HaveScalarStores && TII->isScalarStore(*I))
|
|
|
|
HaveScalarStores = true;
|
|
|
|
|
|
|
|
if (I->getOpcode() == AMDGPU::S_ENDPGM ||
|
|
|
|
I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
|
|
|
|
EndPgmBlocks.push_back(&MBB);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (HaveScalarStores) {
|
|
|
|
// If scalar writes are used, the cache must be flushed or else the next
|
|
|
|
// wave to reuse the same scratch memory can be clobbered.
|
|
|
|
//
|
|
|
|
// Insert s_dcache_wb at wave termination points if there were any scalar
|
|
|
|
// stores, and only if the cache hasn't already been flushed. This could be
|
|
|
|
// improved by looking across blocks for flushes in postdominating blocks
|
|
|
|
// from the stores but an explicitly requested flush is probably very rare.
|
|
|
|
for (MachineBasicBlock *MBB : EndPgmBlocks) {
|
|
|
|
bool SeenDCacheWB = false;
|
|
|
|
|
|
|
|
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
|
|
|
|
++I) {
|
|
|
|
if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
|
|
|
|
SeenDCacheWB = true;
|
|
|
|
else if (TII->isScalarStore(*I))
|
|
|
|
SeenDCacheWB = false;
|
|
|
|
|
|
|
|
// FIXME: It would be better to insert this before a waitcnt if any.
|
|
|
|
if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
|
|
|
|
I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
|
|
|
|
!SeenDCacheWB) {
|
|
|
|
Modified = true;
|
|
|
|
BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-04 18:36:29 +08:00
|
|
|
if (!MFI->isEntryFunction() && callWaitsOnFunctionEntry()) {
|
2017-06-01 00:44:23 +08:00
|
|
|
// Wait for any outstanding memory operations that the input registers may
|
2018-01-29 13:17:03 +08:00
|
|
|
// depend on. We can't track them and it's better to do the wait after the
|
2017-06-01 00:44:23 +08:00
|
|
|
// costly call sequence.
|
|
|
|
|
|
|
|
// TODO: Could insert earlier and schedule more liberally with operations
|
|
|
|
// that only use caller preserved registers.
|
|
|
|
MachineBasicBlock &EntryBB = MF.front();
|
[AMDGPU] Skip CFIInstructions in SIInsertWaitcnts
Summary:
CFI emitted during PEI at the beginning of the prologue needs to apply
to any inserted waitcnts on function entry.
Reviewers: arsenm, t-tye, RamNalamothu
Reviewed By: arsenm
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits
Tags: #llvm, #debug-info
Differential Revision: https://reviews.llvm.org/D76881
2020-06-18 00:38:34 +08:00
|
|
|
MachineBasicBlock::iterator I = EntryBB.begin();
|
|
|
|
for (MachineBasicBlock::iterator E = EntryBB.end();
|
|
|
|
I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
|
|
|
|
;
|
|
|
|
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
|
2019-05-04 05:53:53 +08:00
|
|
|
if (ST->hasVscnt())
|
[AMDGPU] Skip CFIInstructions in SIInsertWaitcnts
Summary:
CFI emitted during PEI at the beginning of the prologue needs to apply
to any inserted waitcnts on function entry.
Reviewers: arsenm, t-tye, RamNalamothu
Reviewed By: arsenm
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits
Tags: #llvm, #debug-info
Differential Revision: https://reviews.llvm.org/D76881
2020-06-18 00:38:34 +08:00
|
|
|
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
|
|
|
|
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
|
|
|
|
.addImm(0);
|
2017-06-01 00:44:23 +08:00
|
|
|
|
|
|
|
Modified = true;
|
|
|
|
}
|
|
|
|
|
2017-04-12 11:25:12 +08:00
|
|
|
return Modified;
|
|
|
|
}
|