forked from OSchip/llvm-project
[MCA] Add an experimental MicroOpQueue stage.
This patch adds an experimental stage named MicroOpQueueStage. MicroOpQueueStage can be used to simulate a hardware micro-op queue (basically, a decoupling queue between 'decode' and 'dispatch'). Users can specify a queue size, as well as an optional MaxIPC (which - in the absence of a "Decoders" stage - can be used to simulate a different throughput from the decoders). This stage is added to the default pipeline between the EntryStage and the DispatchStage only if PipelineOption::MicroOpQueue is different from zero. By default, llvm-mca sets PipelineOption::MicroOpQueue to the value of hidden flag -micro-op-queue-size. Throughput from the decoder can be simulated via another hidden flag named -decoder-throughput. That flag allows us to quickly experiment with different frontend throughputs. For targets that declare a loop buffer, flag -decoder-throughput allows users to do multiple runs, each time simulating a different throughput from the decoders. This stage can/will be extended in the future. For example, we could add a "buffer full" event to notify bottlenecks caused by backpressure. Flag -decoder-throughput would probably go away if in the future we delegate to another stage (DecoderStage?) the simulation of a (potentially variable) throughput from the decoders. For now, flag -decoder-throughput is "good enough" to run some simple experiments. Differential Revision: https://reviews.llvm.org/D59928 llvm-svn: 357248
This commit is contained in:
parent
2b766ed774
commit
e074ac60b4
|
@ -31,11 +31,15 @@ namespace mca {
|
|||
/// This is a convenience struct to hold the parameters necessary for creating
|
||||
/// the pre-built "default" out-of-order pipeline.
|
||||
struct PipelineOptions {
|
||||
PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
|
||||
bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
|
||||
: DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
|
||||
PipelineOptions(unsigned UOPQSize, unsigned DecThr, unsigned DW, unsigned RFS,
|
||||
unsigned LQS, unsigned SQS, bool NoAlias,
|
||||
bool ShouldEnableBottleneckAnalysis = false)
|
||||
: MicroOpQueueSize(UOPQSize), DecodersThroughput(DecThr),
|
||||
DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
|
||||
StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
|
||||
EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
|
||||
unsigned MicroOpQueueSize;
|
||||
unsigned DecodersThroughput; // Instructions per cycle.
|
||||
unsigned DispatchWidth;
|
||||
unsigned RegisterFileSize;
|
||||
unsigned LoadQueueSize;
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
//===---------------------- MicroOpQueueStage.h -----------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
/// \file
|
||||
///
|
||||
/// This file defines a stage that implements a queue of micro opcodes.
|
||||
/// It can be used to simulate a hardware micro-op queue that serves opcodes to
|
||||
/// the out of order backend.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
|
||||
#define LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/MCA/Stages/Stage.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace mca {
|
||||
|
||||
/// A stage that simulates a queue of instruction opcodes.
|
||||
class MicroOpQueueStage : public Stage {
|
||||
SmallVector<InstRef, 8> Buffer;
|
||||
unsigned NextAvailableSlotIdx;
|
||||
unsigned CurrentInstructionSlotIdx;
|
||||
|
||||
// Limits the number of instructions that can be written to this buffer every
|
||||
// cycle. A value of zero means that there is no limit to the instruction
|
||||
// throughput in input.
|
||||
const unsigned MaxIPC;
|
||||
unsigned CurrentIPC;
|
||||
|
||||
// Number of entries that are available during this cycle.
|
||||
unsigned AvailableEntries;
|
||||
|
||||
// True if instructions dispatched to this stage don't need to wait for the
|
||||
// next cycle before moving to the next stage.
|
||||
// False if this buffer acts as a one cycle delay in the execution pipeline.
|
||||
bool IsZeroLatencyStage;
|
||||
|
||||
MicroOpQueueStage(const MicroOpQueueStage &Other) = delete;
|
||||
MicroOpQueueStage &operator=(const MicroOpQueueStage &Other) = delete;
|
||||
|
||||
// By default, an instruction consumes a number of buffer entries equal to its
|
||||
// number of micro opcodes (see field `InstrDesc::NumMicroOpcodes`). The
|
||||
// number of entries consumed by an instruction is normalized to the
|
||||
// minimum value between NumMicroOpcodes and the buffer size. This is to avoid
|
||||
// problems with (microcoded) instructions that generate a number of micro
|
||||
// opcodes than doesn't fit in the buffer.
|
||||
unsigned getNormalizedOpcodes(const InstRef &IR) const {
|
||||
unsigned NormalizedOpcodes =
|
||||
std::min(static_cast<unsigned>(Buffer.size()),
|
||||
IR.getInstruction()->getDesc().NumMicroOps);
|
||||
return NormalizedOpcodes ? NormalizedOpcodes : 1U;
|
||||
}
|
||||
|
||||
Error moveInstructions();
|
||||
|
||||
public:
|
||||
MicroOpQueueStage(unsigned Size, unsigned IPC = 0,
|
||||
bool ZeroLatencyStage = true);
|
||||
|
||||
bool isAvailable(const InstRef &IR) const override {
|
||||
if (MaxIPC && CurrentIPC == MaxIPC)
|
||||
return false;
|
||||
unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
|
||||
if (NormalizedOpcodes > AvailableEntries)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool hasWorkToComplete() const override {
|
||||
return AvailableEntries != Buffer.size();
|
||||
}
|
||||
|
||||
Error execute(InstRef &IR) override;
|
||||
Error cycleStart() override;
|
||||
Error cycleEnd() override;
|
||||
};
|
||||
|
||||
} // namespace mca
|
||||
} // namespace llvm
|
||||
|
||||
#endif // LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
|
|
@ -14,6 +14,7 @@ add_llvm_library(LLVMMCA
|
|||
Stages/EntryStage.cpp
|
||||
Stages/ExecuteStage.cpp
|
||||
Stages/InstructionTables.cpp
|
||||
Stages/MicroOpQueueStage.cpp
|
||||
Stages/RetireStage.cpp
|
||||
Stages/Stage.cpp
|
||||
Support.cpp
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "llvm/MCA/Stages/DispatchStage.h"
|
||||
#include "llvm/MCA/Stages/EntryStage.h"
|
||||
#include "llvm/MCA/Stages/ExecuteStage.h"
|
||||
#include "llvm/MCA/Stages/MicroOpQueueStage.h"
|
||||
#include "llvm/MCA/Stages/RetireStage.h"
|
||||
|
||||
namespace llvm {
|
||||
|
@ -55,6 +56,9 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
|
|||
// Build the pipeline.
|
||||
auto StagePipeline = llvm::make_unique<Pipeline>();
|
||||
StagePipeline->appendStage(std::move(Fetch));
|
||||
if (Opts.MicroOpQueueSize)
|
||||
StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
|
||||
Opts.MicroOpQueueSize, Opts.DecodersThroughput));
|
||||
StagePipeline->appendStage(std::move(Dispatch));
|
||||
StagePipeline->appendStage(std::move(Execute));
|
||||
StagePipeline->appendStage(std::move(Retire));
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
//===---------------------- MicroOpQueueStage.cpp ---------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
/// \file
|
||||
///
|
||||
/// This file defines the MicroOpQueueStage.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/MCA/Stages/MicroOpQueueStage.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace mca {
|
||||
|
||||
#define DEBUG_TYPE "llvm-mca"
|
||||
|
||||
// Drains instructions from the head of the queue towards the next stage, in
// program order, stopping at the first instruction the next stage rejects.
Error MicroOpQueueStage::moveInstructions() {
  for (InstRef IR = Buffer[CurrentInstructionSlotIdx]; IR;
       IR = Buffer[CurrentInstructionSlotIdx]) {
    if (!checkNextStage(IR))
      break;
    if (llvm::Error Val = moveToTheNextStage(IR))
      return Val;

    // Release the entries consumed by this instruction and advance the head
    // of the circular buffer.
    Buffer[CurrentInstructionSlotIdx].invalidate();
    const unsigned FreedEntries = getNormalizedOpcodes(IR);
    CurrentInstructionSlotIdx =
        (CurrentInstructionSlotIdx + FreedEntries) % Buffer.size();
    AvailableEntries += FreedEntries;
  }

  return llvm::ErrorSuccess();
}
|
||||
|
||||
MicroOpQueueStage::MicroOpQueueStage(unsigned Size, unsigned IPC,
                                     bool ZeroLatencyStage)
    : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), MaxIPC(IPC),
      CurrentIPC(0), IsZeroLatencyStage(ZeroLatencyStage) {
  // A queue size of zero is normalized to a single-entry buffer; the buffer
  // starts out completely empty.
  const unsigned EffectiveSize = Size ? Size : 1;
  Buffer.resize(EffectiveSize);
  AvailableEntries = EffectiveSize;
}
|
||||
|
||||
// Writes IR at the tail of the circular buffer, consuming one entry per
// (normalized) micro opcode, and charges it against this cycle's IPC budget.
Error MicroOpQueueStage::execute(InstRef &IR) {
  Buffer[NextAvailableSlotIdx] = IR;
  const unsigned ConsumedEntries = getNormalizedOpcodes(IR);
  NextAvailableSlotIdx =
      (NextAvailableSlotIdx + ConsumedEntries) % Buffer.size();
  AvailableEntries -= ConsumedEntries;
  CurrentIPC++;
  return llvm::ErrorSuccess();
}
|
||||
|
||||
// Resets the per-cycle IPC counter. A non-zero-latency queue drains at cycle
// start, so instructions written this cycle only move on the next cycle.
Error MicroOpQueueStage::cycleStart() {
  CurrentIPC = 0;
  if (IsZeroLatencyStage)
    return llvm::ErrorSuccess();
  return moveInstructions();
}
|
||||
|
||||
// A zero-latency queue drains at cycle end, so instructions written during
// this cycle can already move to the next stage.
Error MicroOpQueueStage::cycleEnd() {
  if (!IsZeroLatencyStage)
    return llvm::ErrorSuccess();
  return moveInstructions();
}
|
||||
|
||||
} // namespace mca
|
||||
} // namespace llvm
|
|
@ -0,0 +1,105 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-1
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-2
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=3 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-3
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-4
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-DEC-2
|
||||
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-1
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-2
|
||||
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-DEC-1
|
||||
|
||||
add %eax, %eax
|
||||
add %ebx, %ebx
|
||||
add %ecx, %ecx
|
||||
add %edx, %edx
|
||||
|
||||
# NOTE(review): no RUN line above uses -check-prefix=BTVER2-DEC-2, so the BTVER2-DEC-2 checks below are never exercised — add a matching RUN line or drop the prefix.
# BTVER2-DEC-2: Iterations: 1500
|
||||
# BTVER2-DEC-2-NEXT: Instructions: 6000
|
||||
# BTVER2-DEC-2-NEXT: Total Cycles: 3003
|
||||
# BTVER2-DEC-2-NEXT: Total uOps: 6000
|
||||
|
||||
# BTVER2-DEC-2: Dispatch Width: 2
|
||||
# BTVER2-DEC-2-NEXT: uOps Per Cycle: 2.00
|
||||
# BTVER2-DEC-2-NEXT: IPC: 2.00
|
||||
# BTVER2-DEC-2-NEXT: Block RThroughput: 2.0
|
||||
|
||||
# BTVER2-DEC-1: Iterations: 1500
|
||||
# BTVER2-DEC-1-NEXT: Instructions: 6000
|
||||
# BTVER2-DEC-1-NEXT: Total Cycles: 6003
|
||||
# BTVER2-DEC-1-NEXT: Total uOps: 6000
|
||||
|
||||
# BTVER2-UOPQ-1: Iterations: 1500
|
||||
# BTVER2-UOPQ-1-NEXT: Instructions: 6000
|
||||
# BTVER2-UOPQ-1-NEXT: Total Cycles: 6003
|
||||
# BTVER2-UOPQ-1-NEXT: Total uOps: 6000
|
||||
|
||||
# BTVER2-UOPQ-2: Iterations: 1500
|
||||
# BTVER2-UOPQ-2-NEXT: Instructions: 6000
|
||||
# BTVER2-UOPQ-2-NEXT: Total Cycles: 3003
|
||||
# BTVER2-UOPQ-2-NEXT: Total uOps: 6000
|
||||
|
||||
# HASWELL-DEC-2: Iterations: 1500
|
||||
# HASWELL-DEC-2-NEXT: Instructions: 6000
|
||||
# HASWELL-DEC-2-NEXT: Total Cycles: 3003
|
||||
# HASWELL-DEC-2-NEXT: Total uOps: 6000
|
||||
|
||||
# HASWELL-UOPQ-1: Iterations: 1500
|
||||
# HASWELL-UOPQ-1-NEXT: Instructions: 6000
|
||||
# HASWELL-UOPQ-1-NEXT: Total Cycles: 6003
|
||||
# HASWELL-UOPQ-1-NEXT: Total uOps: 6000
|
||||
|
||||
# HASWELL-UOPQ-2: Iterations: 1500
|
||||
# HASWELL-UOPQ-2-NEXT: Instructions: 6000
|
||||
# HASWELL-UOPQ-2-NEXT: Total Cycles: 3003
|
||||
# HASWELL-UOPQ-2-NEXT: Total uOps: 6000
|
||||
|
||||
# HASWELL-UOPQ-3: Iterations: 1500
|
||||
# HASWELL-UOPQ-3-NEXT: Instructions: 6000
|
||||
# HASWELL-UOPQ-3-NEXT: Total Cycles: 2003
|
||||
# HASWELL-UOPQ-3-NEXT: Total uOps: 6000
|
||||
|
||||
# HASWELL-UOPQ-4: Iterations: 1500
|
||||
# HASWELL-UOPQ-4-NEXT: Instructions: 6000
|
||||
# HASWELL-UOPQ-4-NEXT: Total Cycles: 1503
|
||||
# HASWELL-UOPQ-4-NEXT: Total uOps: 6000
|
||||
|
||||
# BTVER2-DEC-1: Dispatch Width: 2
|
||||
# BTVER2-DEC-1-NEXT: uOps Per Cycle: 1.00
|
||||
# BTVER2-DEC-1-NEXT: IPC: 1.00
|
||||
# BTVER2-DEC-1-NEXT: Block RThroughput: 2.0
|
||||
|
||||
# BTVER2-UOPQ-1: Dispatch Width: 2
|
||||
# BTVER2-UOPQ-1-NEXT: uOps Per Cycle: 1.00
|
||||
# BTVER2-UOPQ-1-NEXT: IPC: 1.00
|
||||
# BTVER2-UOPQ-1-NEXT: Block RThroughput: 2.0
|
||||
|
||||
# BTVER2-UOPQ-2: Dispatch Width: 2
|
||||
# BTVER2-UOPQ-2-NEXT: uOps Per Cycle: 2.00
|
||||
# BTVER2-UOPQ-2-NEXT: IPC: 2.00
|
||||
# BTVER2-UOPQ-2-NEXT: Block RThroughput: 2.0
|
||||
|
||||
# HASWELL-DEC-2: Dispatch Width: 4
|
||||
# HASWELL-DEC-2-NEXT: uOps Per Cycle: 2.00
|
||||
# HASWELL-DEC-2-NEXT: IPC: 2.00
|
||||
# HASWELL-DEC-2-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# HASWELL-UOPQ-1: Dispatch Width: 4
|
||||
# HASWELL-UOPQ-1-NEXT: uOps Per Cycle: 1.00
|
||||
# HASWELL-UOPQ-1-NEXT: IPC: 1.00
|
||||
# HASWELL-UOPQ-1-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# HASWELL-UOPQ-2: Dispatch Width: 4
|
||||
# HASWELL-UOPQ-2-NEXT: uOps Per Cycle: 2.00
|
||||
# HASWELL-UOPQ-2-NEXT: IPC: 2.00
|
||||
# HASWELL-UOPQ-2-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# HASWELL-UOPQ-3: Dispatch Width: 4
|
||||
# HASWELL-UOPQ-3-NEXT: uOps Per Cycle: 3.00
|
||||
# HASWELL-UOPQ-3-NEXT: IPC: 3.00
|
||||
# HASWELL-UOPQ-3-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# HASWELL-UOPQ-4: Dispatch Width: 4
|
||||
# HASWELL-UOPQ-4-NEXT: uOps Per Cycle: 3.99
|
||||
# HASWELL-UOPQ-4-NEXT: IPC: 3.99
|
||||
# HASWELL-UOPQ-4-NEXT: Block RThroughput: 1.0
|
|
@ -100,6 +100,17 @@ static cl::opt<unsigned>
|
|||
"be used for register mappings"),
|
||||
cl::cat(ToolOptions), cl::init(0));
|
||||
|
||||
// Hidden flag: size of the simulated micro-op queue between the entry stage
// and dispatch. A value of zero (the default) disables the queue stage.
static cl::opt<unsigned>
    MicroOpQueue("micro-op-queue-size", cl::Hidden,
                 cl::desc("Number of entries in the micro-op queue"),
                 cl::cat(ToolOptions), cl::init(0));

// Hidden flag: maximum simulated decoder throughput. A value of zero (the
// default) means the decoders impose no throughput limit.
static cl::opt<unsigned>
    DecoderThroughput("decoder-throughput", cl::Hidden,
                      cl::desc("Maximum throughput from the decoders "
                               "(instructions per cycle)"),
                      cl::cat(ToolOptions), cl::init(0));
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintRegisterFileStats("register-file-stats",
|
||||
cl::desc("Print register file statistics"),
|
||||
|
@ -387,9 +398,9 @@ int main(int argc, char **argv) {
|
|||
// Create a context to control ownership of the pipeline hardware.
|
||||
mca::Context MCA(*MRI, *STI);
|
||||
|
||||
mca::PipelineOptions PO(DispatchWidth, RegisterFileSize, LoadQueueSize,
|
||||
StoreQueueSize, AssumeNoAlias,
|
||||
EnableBottleneckAnalysis);
|
||||
mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth,
|
||||
RegisterFileSize, LoadQueueSize, StoreQueueSize,
|
||||
AssumeNoAlias, EnableBottleneckAnalysis);
|
||||
|
||||
// Number each region in the sequence.
|
||||
unsigned RegionIdx = 0;
|
||||
|
|
Loading…
Reference in New Issue