2018-03-08 21:05:02 +08:00
|
|
|
//===--------------------- Scheduler.cpp ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A scheduler for processor resource units and processor resource groups.
//
//===----------------------------------------------------------------------===//
|
|
|
|
|
2018-12-17 16:08:31 +08:00
|
|
|
#include "llvm/MCA/HardwareUnits/Scheduler.h"
|
2018-06-26 18:44:12 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2018-03-08 21:05:02 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
|
2018-10-30 23:56:08 +08:00
|
|
|
namespace llvm {
|
2018-03-08 21:05:02 +08:00
|
|
|
namespace mca {
|
|
|
|
|
2018-06-26 18:44:12 +08:00
|
|
|
#define DEBUG_TYPE "llvm-mca"
|
|
|
|
|
2018-08-24 02:42:37 +08:00
|
|
|
/// Installs the scheduling strategy used by this Scheduler.
///
/// \param S  Strategy object to take ownership of. If \p S is null, a
///           DefaultSchedulerStrategy is created and installed instead.
void Scheduler::initializeStrategy(std::unique_ptr<SchedulerStrategy> S) {
  // Ensure we have a valid (non-null) strategy object.
  Strategy = S ? std::move(S) : llvm::make_unique<DefaultSchedulerStrategy>();
}
|
|
|
|
|
2018-08-22 02:20:16 +08:00
|
|
|
// Anchor the vtable of SchedulerStrategy and DefaultSchedulerStrategy.
|
|
|
|
SchedulerStrategy::~SchedulerStrategy() = default;
|
|
|
|
DefaultSchedulerStrategy::~DefaultSchedulerStrategy() = default;
|
|
|
|
|
2018-03-08 21:05:02 +08:00
|
|
|
#ifndef NDEBUG
|
|
|
|
void Scheduler::dump() const {
|
2018-08-03 20:55:28 +08:00
|
|
|
dbgs() << "[SCHEDULER]: WaitSet size is: " << WaitSet.size() << '\n';
|
|
|
|
dbgs() << "[SCHEDULER]: ReadySet size is: " << ReadySet.size() << '\n';
|
|
|
|
dbgs() << "[SCHEDULER]: IssuedSet size is: " << IssuedSet.size() << '\n';
|
2018-03-08 21:05:02 +08:00
|
|
|
Resources->dump();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2019-02-26 22:19:00 +08:00
|
|
|
/// Checks whether instruction \p IR can currently be accepted.
///
/// The buffered resources named by the instruction descriptor are queried
/// first; the load/store unit is consulted only when that query reports that
/// buffers are available.
///
/// Side effect: `HadTokenStall` is overwritten by each query that runs. It is
/// set to true when the query result is anything other than "available", so on
/// return it reflects the last query that was performed.
///
/// \returns SC_BUFFERS_FULL or SC_DISPATCH_GROUP_STALL when the resource query
///          fails, SC_LOAD_QUEUE_FULL or SC_STORE_QUEUE_FULL when the LSUnit
///          query fails, and SC_AVAILABLE otherwise.
Scheduler::Status Scheduler::isAvailable(const InstRef &IR) {
  const InstrDesc &Desc = IR.getInstruction()->getDesc();

  ResourceStateEvent RSE = Resources->canBeDispatched(Desc.Buffers);
  HadTokenStall = RSE != RS_BUFFER_AVAILABLE;

  switch (RSE) {
  case ResourceStateEvent::RS_BUFFER_UNAVAILABLE:
    return Scheduler::SC_BUFFERS_FULL;
  case ResourceStateEvent::RS_RESERVED:
    return Scheduler::SC_DISPATCH_GROUP_STALL;
  case ResourceStateEvent::RS_BUFFER_AVAILABLE:
    // Buffers are fine; fall out of the switch and query the LSUnit.
    break;
  }

  // Give lower priority to LSUnit stall events.
  LSUnit::Status LSS = LSU.isAvailable(IR);
  HadTokenStall = LSS != LSUnit::LSU_AVAILABLE;

  switch (LSS) {
  case LSUnit::LSU_LQUEUE_FULL:
    return Scheduler::SC_LOAD_QUEUE_FULL;
  case LSUnit::LSU_SQUEUE_FULL:
    return Scheduler::SC_STORE_QUEUE_FULL;
  case LSUnit::LSU_AVAILABLE:
    return Scheduler::SC_AVAILABLE;
  }

  llvm_unreachable("Don't know how to process this LSU state result!");
}
|
|
|
|
|
2018-04-24 22:53:16 +08:00
|
|
|
void Scheduler::issueInstructionImpl(
|
2018-05-08 02:29:15 +08:00
|
|
|
InstRef &IR,
|
[llvm-mca] Delay calculation of Cycles per Resources, separate the cycles and resource quantities.
Summary:
This patch removes the storing of accumulated floating point data
within the llvm-mca library.
This patch splits-up the two quantities: cycles and number of resource units.
By splitting-up these two quantities, we delay the calculation of "cycles per resource unit"
until that value is read, reducing the chance of accumulating floating point error.
I considered using the APFloat, but after measuring performance, for a large (many iteration)
sample, I decided to go with this faster solution.
Reviewers: andreadb, courbet, RKSimon
Reviewed By: andreadb
Subscribers: llvm-commits, javed.absar, tschuett, gbedwell
Differential Revision: https://reviews.llvm.org/D51903
llvm-svn: 341980
2018-09-12 02:47:48 +08:00
|
|
|
SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &UsedResources) {
|
2018-05-08 02:29:15 +08:00
|
|
|
Instruction *IS = IR.getInstruction();
|
|
|
|
const InstrDesc &D = IS->getDesc();
|
2018-03-21 02:20:39 +08:00
|
|
|
|
2018-03-08 21:05:02 +08:00
|
|
|
// Issue the instruction and collect all the consumed resources
|
|
|
|
// into a vector. That vector is then used to notify the listener.
|
2018-04-27 06:30:40 +08:00
|
|
|
Resources->issueInstruction(D, UsedResources);
|
2018-04-24 22:53:16 +08:00
|
|
|
|
2018-03-08 21:05:02 +08:00
|
|
|
// Notify the instruction that it started executing.
|
|
|
|
// This updates the internal state of each write.
|
2019-02-18 19:27:11 +08:00
|
|
|
IS->execute(IR.getSourceIndex());
|
2018-03-08 21:05:02 +08:00
|
|
|
|
2018-05-08 02:29:15 +08:00
|
|
|
if (IS->isExecuting())
|
2018-08-03 20:55:28 +08:00
|
|
|
IssuedSet.emplace_back(IR);
|
2018-08-20 22:41:36 +08:00
|
|
|
else if (IS->isExecuted())
|
2018-11-30 20:49:30 +08:00
|
|
|
LSU.onInstructionExecuted(IR);
|
2018-04-24 22:53:16 +08:00
|
|
|
}
|
2018-03-22 18:19:20 +08:00
|
|
|
|
2018-06-14 09:20:18 +08:00
|
|
|
// Release the buffered resources and issue the instruction.
|
|
|
|
void Scheduler::issueInstruction(
|
[llvm-mca] Delay calculation of Cycles per Resources, separate the cycles and resource quantities.
Summary:
This patch removes the storing of accumulated floating point data
within the llvm-mca library.
This patch splits-up the two quantities: cycles and number of resource units.
By splitting-up these two quantities, we delay the calculation of "cycles per resource unit"
until that value is read, reducing the chance of accumulating floating point error.
I considered using the APFloat, but after measuring performance, for a large (many iteration)
sample, I decided to go with this faster solution.
Reviewers: andreadb, courbet, RKSimon
Reviewed By: andreadb
Subscribers: llvm-commits, javed.absar, tschuett, gbedwell
Differential Revision: https://reviews.llvm.org/D51903
llvm-svn: 341980
2018-09-12 02:47:48 +08:00
|
|
|
InstRef &IR,
|
|
|
|
SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &UsedResources,
|
2019-05-06 00:07:27 +08:00
|
|
|
SmallVectorImpl<InstRef> &PendingInstructions,
|
2018-08-21 20:40:15 +08:00
|
|
|
SmallVectorImpl<InstRef> &ReadyInstructions) {
|
|
|
|
const Instruction &Inst = *IR.getInstruction();
|
|
|
|
bool HasDependentUsers = Inst.hasDependentUsers();
|
|
|
|
|
|
|
|
Resources->releaseBuffers(Inst.getDesc().Buffers);
|
2018-06-14 09:20:18 +08:00
|
|
|
issueInstructionImpl(IR, UsedResources);
|
2018-08-21 20:40:15 +08:00
|
|
|
// Instructions that have been issued during this cycle might have unblocked
|
|
|
|
// other dependent instructions. Dependent instructions may be issued during
|
|
|
|
// this same cycle if operands have ReadAdvance entries. Promote those
|
|
|
|
// instructions to the ReadySet and notify the caller that those are ready.
|
2019-05-06 00:07:27 +08:00
|
|
|
if (HasDependentUsers && promoteToPendingSet(PendingInstructions))
|
2018-08-21 20:40:15 +08:00
|
|
|
promoteToReadySet(ReadyInstructions);
|
2018-03-08 21:05:02 +08:00
|
|
|
}
|
|
|
|
|
2019-02-13 19:02:42 +08:00
|
|
|
/// Moves instructions from the PendingSet to the ReadySet.
///
/// Each promoted instruction is appended to both \p Ready and the ReadySet.
/// Its slot in the PendingSet is invalidated and swapped towards the tail of
/// the vector; the tail is dropped with a single resize() at the end.
///
/// \param Ready  Output vector receiving every instruction promoted by this
///               call.
/// \returns true if at least one instruction was promoted.
bool Scheduler::promoteToReadySet(SmallVectorImpl<InstRef> &Ready) {
  // Scan the set of pending instructions and promote them to the
  // ready set if operands are all ready.
  unsigned PromotedElements = 0;
  for (auto I = PendingSet.begin(), E = PendingSet.end(); I != E;) {
    InstRef &IR = *I;
    // An invalid reference is a slot that was already promoted and swapped in
    // from the tail: everything that is left has been processed.
    if (!IR)
      break;

    // Check if there are still unsolved memory dependencies.
    Instruction &IS = *IR.getInstruction();
    if (IS.isMemOp()) {
      const InstRef &CriticalMemDep = LSU.isReady(IR);
      // LSU.isReady() returning something other than IR itself is treated as
      // "not ready": record that instruction as the critical memory
      // dependency and leave IR in the PendingSet.
      if (CriticalMemDep != IR) {
        IS.setCriticalMemDep(CriticalMemDep.getSourceIndex());
        ++I;
        continue;
      }
    }

    // Check if this instruction is now ready. In case, force
    // a transition in state using method 'updatePending()'.
    if (!IS.isReady() && !IS.updatePending()) {
      ++I;
      continue;
    }
    LLVM_DEBUG(dbgs() << "[SCHEDULER]: Instruction #" << IR
                      << " promoted to the READY set.\n");

    Ready.emplace_back(IR);
    ReadySet.emplace_back(IR);

    // Invalidate the slot and move it to the tail. I is intentionally not
    // advanced: the element swapped into this position is examined next.
    IR.invalidate();
    ++PromotedElements;
    std::iter_swap(I, E - PromotedElements);
  }

  PendingSet.resize(PendingSet.size() - PromotedElements);
  return PromotedElements;
}
|
|
|
|
|
2019-05-06 00:07:27 +08:00
|
|
|
/// Moves instructions from the WaitSet to the PendingSet.
///
/// Each promoted instruction is appended to both \p Pending and the
/// PendingSet. Its slot in the WaitSet is invalidated and swapped towards the
/// tail of the vector; the tail is dropped with a single resize() at the end.
///
/// \param Pending  Output vector receiving every instruction promoted by this
///                 call.
/// \returns true if at least one instruction was promoted.
bool Scheduler::promoteToPendingSet(SmallVectorImpl<InstRef> &Pending) {
  // Scan the set of waiting instructions and promote them to the
  // pending set once they are no longer stuck in the dispatched state.
  unsigned RemovedElements = 0;
  for (auto I = WaitSet.begin(), E = WaitSet.end(); I != E;) {
    InstRef &IR = *I;
    // An invalid reference is a slot that was already promoted and swapped in
    // from the tail: everything that is left has been processed.
    if (!IR)
      break;

    // Check if this instruction can leave the dispatched state. In case, force
    // a transition in state using method 'updateDispatched()'.
    Instruction &IS = *IR.getInstruction();
    if (IS.isDispatched() && !IS.updateDispatched()) {
      ++I;
      continue;
    }
    LLVM_DEBUG(dbgs() << "[SCHEDULER]: Instruction #" << IR
                      << " promoted to the PENDING set.\n");

    Pending.emplace_back(IR);
    PendingSet.emplace_back(IR);

    // Invalidate the slot and move it to the tail. I is intentionally not
    // advanced: the element swapped into this position is examined next.
    IR.invalidate();
    ++RemovedElements;
    std::iter_swap(I, E - RemovedElements);
  }

  WaitSet.resize(WaitSet.size() - RemovedElements);
  return RemovedElements;
}
|
|
|
|
|
2018-05-08 02:29:15 +08:00
|
|
|
/// Picks the next instruction to issue from the ReadySet and removes it from
/// that set.
///
/// Side effects: for every candidate whose resource availability is checked,
/// the returned mask is stored on the instruction via
/// setCriticalResourceMask() and OR-ed into `BusyResourceUnits`.
///
/// \returns the selected instruction, or a default-constructed InstRef if no
///          instruction in the ReadySet can be issued.
InstRef Scheduler::select() {
  // QueueIndex == ReadySet.size() means "nothing selected yet".
  unsigned QueueIndex = ReadySet.size();
  for (unsigned I = 0, E = ReadySet.size(); I != E; ++I) {
    InstRef &IR = ReadySet[I];
    // Only look at IR if there is no selection yet, or if the strategy ranks
    // IR ahead of the current selection.
    if (QueueIndex == ReadySet.size() ||
        Strategy->compare(IR, ReadySet[QueueIndex])) {
      Instruction &IS = *IR.getInstruction();
      uint64_t BusyResourceMask = Resources->checkAvailability(IS.getDesc());
      IS.setCriticalResourceMask(BusyResourceMask);
      BusyResourceUnits |= BusyResourceMask;
      // A zero mask means the availability check reported no busy resource.
      if (!BusyResourceMask)
        QueueIndex = I;
    }
  }

  if (QueueIndex == ReadySet.size())
    return InstRef();

  // We found an instruction to issue.
  // Remove it by swapping it with the last element and popping the back.
  InstRef IR = ReadySet[QueueIndex];
  std::swap(ReadySet[QueueIndex], ReadySet[ReadySet.size() - 1]);
  ReadySet.pop_back();
  return IR;
}
|
|
|
|
|
2018-08-03 20:55:28 +08:00
|
|
|
/// Removes from the IssuedSet every instruction that has finished executing.
///
/// For each such instruction the LSUnit is notified and the instruction is
/// appended to \p Executed. Its slot in the IssuedSet is invalidated and
/// swapped towards the tail; the tail is dropped with a single resize().
///
/// \param Executed  Output vector receiving the instructions that completed.
void Scheduler::updateIssuedSet(SmallVectorImpl<InstRef> &Executed) {
  unsigned RemovedElements = 0;
  for (auto I = IssuedSet.begin(), E = IssuedSet.end(); I != E;) {
    InstRef &IR = *I;
    // An invalid reference is a slot that was already removed and swapped in
    // from the tail: everything that is left has been processed.
    if (!IR)
      break;
    Instruction &IS = *IR.getInstruction();
    if (!IS.isExecuted()) {
      LLVM_DEBUG(dbgs() << "[SCHEDULER]: Instruction #" << IR
                        << " is still executing.\n");
      ++I;
      continue;
    }

    // Instruction IR has completed execution.
    LSU.onInstructionExecuted(IR);
    Executed.emplace_back(IR);
    ++RemovedElements;
    // I is intentionally not advanced: the element swapped into this position
    // is examined next.
    IR.invalidate();
    std::iter_swap(I, E - RemovedElements);
  }

  IssuedSet.resize(IssuedSet.size() - RemovedElements);
}
|
|
|
|
|
[MCA] Highlight kernel bottlenecks in the summary view.
This patch adds a new flag named -bottleneck-analysis to print out information
about throughput bottlenecks.
MCA knows how to identify and classify dynamic dispatch stalls. However, it
doesn't know how to analyze and highlight kernel bottlenecks. The goal of this
patch is to teach MCA how to correlate increases in backend pressure to backend
stalls (and therefore, the loss of throughput).
From a Scheduler point of view, backend pressure is a function of the scheduler
buffer usage (i.e. how the number of uOps in the scheduler buffers changes over
time). Backend pressure increases (or decreases) when there is a mismatch
between the number of opcodes dispatched, and the number of opcodes issued in
the same cycle. Since buffer resources are limited, continuous increases in
backend pressure would eventually leads to dispatch stalls. So, there is a
strong correlation between dispatch stalls, and how backpressure changed over
time.
This patch teaches how to identify situations where backend pressure increases
due to:
- unavailable pipeline resources.
- data dependencies.
Data dependencies may delay execution of instructions and therefore increase the
time that uOps have to spend in the scheduler buffers. That often translates to
an increase in backend pressure which may eventually lead to a bottleneck.
Contention on pipeline resources may also delay execution of instructions, and
lead to a temporary increase in backend pressure.
Internally, the Scheduler classifies instructions based on whether register /
memory operands are available or not.
An instruction is marked as "ready to execute" only if data dependencies are
fully resolved.
Every cycle, the Scheduler attempts to execute all instructions that are ready
to execute. If an instruction cannot execute because of unavailable pipeline
resources, then the Scheduler internally updates a BusyResourceUnits mask with
the ID of each unavailable resource.
ExecuteStage is responsible for tracking changes in backend pressure. If backend
pressure increases during a cycle because of contention on pipeline resources,
then ExecuteStage sends a "backend pressure" event to the listeners.
That event would contain information about instructions delayed by resource
pressure, as well as the BusyResourceUnits mask.
Note that ExecuteStage also knows how to identify situations where backpressure
increased because of delays introduced by data dependencies.
The SummaryView observes "backend pressure" events and prints out a "bottleneck
report".
Example of bottleneck report:
```
Cycles with backend pressure increase [ 99.89% ]
Throughput Bottlenecks:
Resource Pressure [ 0.00% ]
Data Dependencies: [ 99.89% ]
- Register Dependencies [ 0.00% ]
- Memory Dependencies [ 99.89% ]
```
A bottleneck report is printed out only if increases in backend pressure
eventually caused backend stalls.
About the time complexity:
Time complexity is linear in the number of instructions in the
Scheduler::PendingSet.
The average slowdown tends to be in the range of ~5-6%.
For memory intensive kernels, the slowdown can be significant if flag
-noalias=false is specified. In the worst case scenario I have observed a
slowdown of ~30% when flag -noalias=false was specified.
We can definitely recover part of that slowdown if we optimize class LSUnit (by
doing extra bookkeeping to speedup queries). For now, this new analysis is
disabled by default, and it can be enabled via flag -bottleneck-analysis. Users
of MCA as a library can enable the generation of pressure events through the
constructor of ExecuteStage.
This patch partially addresses https://bugs.llvm.org/show_bug.cgi?id=37494
Differential Revision: https://reviews.llvm.org/D58728
llvm-svn: 355308
2019-03-04 19:52:34 +08:00
|
|
|
/// Reports the instructions currently sitting in the ReadySet together with
/// the accumulated busy-resource mask.
///
/// \param Insts  Output vector; the contents of the ReadySet are appended to
///               it.
/// \returns the current value of `BusyResourceUnits` (accumulated by select()
///          and reset to zero by cycleEvent()).
uint64_t Scheduler::analyzeResourcePressure(SmallVectorImpl<InstRef> &Insts) {
  Insts.insert(Insts.end(), ReadySet.begin(), ReadySet.end());
  return BusyResourceUnits;
}
|
|
|
|
|
|
|
|
/// Classifies instructions in the PendingSet by the kind of dependency that is
/// keeping them there.
///
/// The last `NumDispatchedToThePendingSet` entries of the PendingSet are
/// skipped; that counter is incremented by dispatch() whenever it appends to
/// the PendingSet and reset by cycleEvent(). Instructions for which the
/// resource availability check returns a non-zero mask are skipped as well.
///
/// \param RegDeps  Output: instructions classified as register-dependent.
/// \param MemDeps  Output: instructions classified as memory-dependent.
void Scheduler::analyzeDataDependencies(SmallVectorImpl<InstRef> &RegDeps,
                                        SmallVectorImpl<InstRef> &MemDeps) {
  const auto EndIt = PendingSet.end() - NumDispatchedToThePendingSet;
  for (InstRef &IR : make_range(PendingSet.begin(), EndIt)) {
    Instruction &IS = *IR.getInstruction();
    // Non-zero mask: some required resource is busy, so this instruction is
    // not attributed to a data dependency.
    if (Resources->checkAvailability(IS.getDesc()))
      continue;

    // An instruction that is already in the ready state, or a memory operation
    // for which LSU.isReady() returns a different instruction, is attributed
    // to a memory dependency. Everything else is attributed to a register
    // dependency.
    if (IS.isReady() || (IS.isMemOp() && LSU.isReady(IR) != IR))
      MemDeps.emplace_back(IR);
    else
      RegDeps.emplace_back(IR);
  }
}
|
|
|
|
|
2018-08-21 20:40:15 +08:00
|
|
|
void Scheduler::cycleEvent(SmallVectorImpl<ResourceRef> &Freed,
|
|
|
|
SmallVectorImpl<InstRef> &Executed,
|
2019-05-06 00:07:27 +08:00
|
|
|
SmallVectorImpl<InstRef> &Pending,
|
2018-08-21 20:40:15 +08:00
|
|
|
SmallVectorImpl<InstRef> &Ready) {
|
|
|
|
// Release consumed resources.
|
2018-06-14 09:20:18 +08:00
|
|
|
Resources->cycleEvent(Freed);
|
2018-08-21 20:40:15 +08:00
|
|
|
|
|
|
|
for (InstRef &IR : IssuedSet)
|
|
|
|
IR.getInstruction()->cycleEvent();
|
|
|
|
updateIssuedSet(Executed);
|
|
|
|
|
2019-02-13 19:02:42 +08:00
|
|
|
for (InstRef &IR : PendingSet)
|
|
|
|
IR.getInstruction()->cycleEvent();
|
|
|
|
|
2018-08-21 20:40:15 +08:00
|
|
|
for (InstRef &IR : WaitSet)
|
|
|
|
IR.getInstruction()->cycleEvent();
|
2018-08-22 02:20:16 +08:00
|
|
|
|
2019-05-06 00:07:27 +08:00
|
|
|
promoteToPendingSet(Pending);
|
2018-08-21 20:40:15 +08:00
|
|
|
promoteToReadySet(Ready);
|
2019-02-12 01:55:47 +08:00
|
|
|
|
2019-02-21 02:01:49 +08:00
|
|
|
NumDispatchedToThePendingSet = 0;
|
2019-02-12 01:55:47 +08:00
|
|
|
BusyResourceUnits = 0;
|
2018-03-08 21:05:02 +08:00
|
|
|
}
|
|
|
|
|
2018-08-20 22:41:36 +08:00
|
|
|
bool Scheduler::mustIssueImmediately(const InstRef &IR) const {
|
2019-01-04 23:08:38 +08:00
|
|
|
const InstrDesc &Desc = IR.getInstruction()->getDesc();
|
|
|
|
if (Desc.isZeroLatency())
|
|
|
|
return true;
|
2018-08-20 22:41:36 +08:00
|
|
|
// Instructions that use an in-order dispatch/issue processor resource must be
|
|
|
|
// issued immediately to the pipeline(s). Any other in-order buffered
|
|
|
|
// resources (i.e. BufferSize=1) is consumed.
|
2019-01-04 23:08:38 +08:00
|
|
|
return Desc.MustIssueImmediately;
|
2018-08-20 22:41:36 +08:00
|
|
|
}
|
|
|
|
|
2019-02-21 02:01:49 +08:00
|
|
|
bool Scheduler::dispatch(const InstRef &IR) {
|
|
|
|
const Instruction &IS = *IR.getInstruction();
|
|
|
|
const InstrDesc &Desc = IS.getDesc();
|
2018-08-20 22:41:36 +08:00
|
|
|
Resources->reserveBuffers(Desc.Buffers);
|
|
|
|
|
2018-06-14 09:20:18 +08:00
|
|
|
// If necessary, reserve queue entries in the load-store unit (LSU).
|
2019-02-21 02:01:49 +08:00
|
|
|
if (IS.isMemOp())
|
2018-11-30 20:49:30 +08:00
|
|
|
LSU.dispatch(IR);
|
2018-08-20 22:41:36 +08:00
|
|
|
|
2019-02-21 02:01:49 +08:00
|
|
|
if (IS.isPending()) {
|
2019-02-13 19:02:42 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR
|
|
|
|
<< " to the PendingSet\n");
|
|
|
|
PendingSet.push_back(IR);
|
2019-02-21 02:01:49 +08:00
|
|
|
++NumDispatchedToThePendingSet;
|
|
|
|
return false;
|
2019-02-13 19:02:42 +08:00
|
|
|
}
|
|
|
|
|
2019-05-06 00:07:27 +08:00
|
|
|
// Memory operations that are not in a ready state are initially assigned to
|
|
|
|
// the WaitSet.
|
2019-05-23 21:42:47 +08:00
|
|
|
if (!IS.isReady() || (IS.isMemOp() && LSU.isReady(IR) != IR)) {
|
2018-08-03 20:55:28 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR << " to the WaitSet\n");
|
|
|
|
WaitSet.push_back(IR);
|
2019-02-21 02:01:49 +08:00
|
|
|
return false;
|
2018-06-14 09:20:18 +08:00
|
|
|
}
|
2018-03-21 02:20:39 +08:00
|
|
|
|
2018-08-20 22:41:36 +08:00
|
|
|
// Don't add a zero-latency instruction to the Ready queue.
|
|
|
|
// A zero-latency instruction doesn't consume any scheduler resources. That is
|
|
|
|
// because it doesn't need to be executed, and it is often removed at register
|
|
|
|
// renaming stage. For example, register-register moves are often optimized at
|
|
|
|
// register renaming stage by simply updating register aliases. On some
|
|
|
|
// targets, zero-idiom instructions (for example: a xor that clears the value
|
|
|
|
// of a register) are treated specially, and are often eliminated at register
|
|
|
|
// renaming stage.
|
|
|
|
if (!mustIssueImmediately(IR)) {
|
2018-08-03 20:55:28 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR << " to the ReadySet\n");
|
|
|
|
ReadySet.push_back(IR);
|
2018-06-14 09:20:18 +08:00
|
|
|
}
|
2018-08-20 22:41:36 +08:00
|
|
|
|
2019-02-21 02:01:49 +08:00
|
|
|
return true;
|
2018-03-21 02:20:39 +08:00
|
|
|
}
|
|
|
|
|
2018-03-08 21:05:02 +08:00
|
|
|
} // namespace mca
|
2018-10-30 23:56:08 +08:00
|
|
|
} // namespace llvm
|