[MCA] Add an experimental MicroOpQueue stage.

This patch adds an experimental stage named MicroOpQueueStage.
MicroOpQueueStage can be used to simulate a hardware micro-op queue (basically,
a decoupling queue between 'decode' and 'dispatch').  Users can specify a queue
size, as well as an optional MaxIPC (which, in the absence of a "Decoders"
stage, can be used to simulate a different throughput from the decoders).
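
As an illustration only (a minimal sketch, not part of the patch): a tool that
builds its own pipeline could append the new stage roughly as follows, using
the constructor added by this patch. The StagePipeline variable and the chosen
queue size / MaxIPC values are made up for the example.

  // Model a 28-entry micro-op queue that accepts at most 4 instructions per
  // cycle (illustrative values only).
  StagePipeline->appendStage(
      llvm::make_unique<MicroOpQueueStage>(/*Size=*/28, /*IPC=*/4));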

This stage is added to the default pipeline between the EntryStage and the
DispatchStage only if PipelineOptions::MicroOpQueueSize is non-zero. By
default, llvm-mca sets PipelineOptions::MicroOpQueueSize to the value of the
hidden flag -micro-op-queue-size.

Throughput from the decoders can be simulated via another hidden flag named
-decoder-throughput.  That flag makes it easy to quickly experiment with
different frontend throughputs.  For targets that declare a loop buffer, the
-decoder-throughput flag lets users perform multiple runs, each time simulating
a different throughput from the decoders.
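
For example (assuming an input assembly file named foo.s; the two flags below
are the hidden flags added by this patch), one could compare a run that is only
limited by the queue size against a run that also caps the decoder throughput:

  $ llvm-mca -mcpu=haswell -micro-op-queue-size=4 foo.s
  $ llvm-mca -mcpu=haswell -micro-op-queue-size=4 -decoder-throughput=2 foo.s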

This stage can (and will) be extended in the future. For example, we could add
a "buffer full" event to report bottlenecks caused by backpressure.  Flag
-decoder-throughput would probably go away if, in the future, we delegated the
simulation of a (potentially variable) decoder throughput to another stage
(a DecoderStage?).  For now, flag -decoder-throughput is "good enough" to run
some simple experiments.

Differential Revision: https://reviews.llvm.org/D59928

llvm-svn: 357248
Andrea Di Biagio 2019-03-29 12:15:37 +00:00
parent 2b766ed774
commit e074ac60b4
7 changed files with 289 additions and 6 deletions

@@ -31,11 +31,15 @@ namespace mca {
 /// This is a convenience struct to hold the parameters necessary for creating
 /// the pre-built "default" out-of-order pipeline.
 struct PipelineOptions {
-  PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
-                  bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
-      : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
+  PipelineOptions(unsigned UOPQSize, unsigned DecThr, unsigned DW, unsigned RFS,
+                  unsigned LQS, unsigned SQS, bool NoAlias,
+                  bool ShouldEnableBottleneckAnalysis = false)
+      : MicroOpQueueSize(UOPQSize), DecodersThroughput(DecThr),
+        DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
         StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
         EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
+  unsigned MicroOpQueueSize;
+  unsigned DecodersThroughput; // Instructions per cycle.
   unsigned DispatchWidth;
   unsigned RegisterFileSize;
   unsigned LoadQueueSize;

@@ -0,0 +1,88 @@
//===---------------------- MicroOpQueueStage.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file defines a stage that implements a queue of micro opcodes.
/// It can be used to simulate a hardware micro-op queue that serves opcodes to
/// the out-of-order backend.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
#define LLVM_MCA_MICRO_OP_QUEUE_STAGE_H

#include "llvm/ADT/SmallVector.h"
#include "llvm/MCA/Stages/Stage.h"

namespace llvm {
namespace mca {

/// A stage that simulates a queue of instruction opcodes.
class MicroOpQueueStage : public Stage {
  SmallVector<InstRef, 8> Buffer;
  unsigned NextAvailableSlotIdx;
  unsigned CurrentInstructionSlotIdx;

  // Limits the number of instructions that can be written to this buffer every
  // cycle. A value of zero means that there is no limit to the instruction
  // throughput in input.
  const unsigned MaxIPC;
  unsigned CurrentIPC;

  // Number of entries that are available during this cycle.
  unsigned AvailableEntries;

  // True if instructions dispatched to this stage don't need to wait for the
  // next cycle before moving to the next stage.
  // False if this buffer acts as a one cycle delay in the execution pipeline.
  bool IsZeroLatencyStage;

  MicroOpQueueStage(const MicroOpQueueStage &Other) = delete;
  MicroOpQueueStage &operator=(const MicroOpQueueStage &Other) = delete;

  // By default, an instruction consumes a number of buffer entries equal to its
  // number of micro opcodes (see field `InstrDesc::NumMicroOps`). The number of
  // entries consumed by an instruction is normalized to the minimum value
  // between NumMicroOps and the buffer size. This is to avoid problems with
  // (microcoded) instructions that generate a number of micro opcodes that
  // doesn't fit in the buffer.
  unsigned getNormalizedOpcodes(const InstRef &IR) const {
    unsigned NormalizedOpcodes =
        std::min(static_cast<unsigned>(Buffer.size()),
                 IR.getInstruction()->getDesc().NumMicroOps);
    return NormalizedOpcodes ? NormalizedOpcodes : 1U;
  }

  Error moveInstructions();

public:
  MicroOpQueueStage(unsigned Size, unsigned IPC = 0,
                    bool ZeroLatencyStage = true);

  bool isAvailable(const InstRef &IR) const override {
    if (MaxIPC && CurrentIPC == MaxIPC)
      return false;
    unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
    if (NormalizedOpcodes > AvailableEntries)
      return false;
    return true;
  }

  bool hasWorkToComplete() const override {
    return AvailableEntries != Buffer.size();
  }

  Error execute(InstRef &IR) override;
  Error cycleStart() override;
  Error cycleEnd() override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
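
A quick standalone illustration of the clamp performed by getNormalizedOpcodes
above (not part of the patch; the values below are made up): a microcoded
instruction whose uOp count exceeds the queue size is clamped to the queue
size, so it can still be buffered.

  unsigned BufferSize = 8;                                  // hypothetical queue size
  unsigned NumMicroOps = 11;                                // hypothetical microcoded instruction
  unsigned Normalized = std::min(BufferSize, NumMicroOps);  // -> 8 (clamped to the buffer size)
  unsigned Consumed = Normalized ? Normalized : 1U;         // zero-uOp instructions still use 1 entry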

@@ -14,6 +14,7 @@ add_llvm_library(LLVMMCA
   Stages/EntryStage.cpp
   Stages/ExecuteStage.cpp
   Stages/InstructionTables.cpp
+  Stages/MicroOpQueueStage.cpp
   Stages/RetireStage.cpp
   Stages/Stage.cpp
   Support.cpp

@@ -21,6 +21,7 @@
 #include "llvm/MCA/Stages/DispatchStage.h"
 #include "llvm/MCA/Stages/EntryStage.h"
 #include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
 #include "llvm/MCA/Stages/RetireStage.h"
 
 namespace llvm {
@@ -55,6 +56,9 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
   // Build the pipeline.
   auto StagePipeline = llvm::make_unique<Pipeline>();
   StagePipeline->appendStage(std::move(Fetch));
+  if (Opts.MicroOpQueueSize)
+    StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
+        Opts.MicroOpQueueSize, Opts.DecodersThroughput));
   StagePipeline->appendStage(std::move(Dispatch));
   StagePipeline->appendStage(std::move(Execute));
   StagePipeline->appendStage(std::move(Retire));

@@ -0,0 +1,70 @@
//===---------------------- MicroOpQueueStage.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file defines the MicroOpQueueStage.
///
//===----------------------------------------------------------------------===//

#include "llvm/MCA/Stages/MicroOpQueueStage.h"

namespace llvm {
namespace mca {

#define DEBUG_TYPE "llvm-mca"

Error MicroOpQueueStage::moveInstructions() {
  InstRef IR = Buffer[CurrentInstructionSlotIdx];
  while (IR && checkNextStage(IR)) {
    if (llvm::Error Val = moveToTheNextStage(IR))
      return Val;

    Buffer[CurrentInstructionSlotIdx].invalidate();
    unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
    CurrentInstructionSlotIdx += NormalizedOpcodes;
    CurrentInstructionSlotIdx %= Buffer.size();
    AvailableEntries += NormalizedOpcodes;
    IR = Buffer[CurrentInstructionSlotIdx];
  }

  return llvm::ErrorSuccess();
}

MicroOpQueueStage::MicroOpQueueStage(unsigned Size, unsigned IPC,
                                     bool ZeroLatencyStage)
    : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), MaxIPC(IPC),
      CurrentIPC(0), IsZeroLatencyStage(ZeroLatencyStage) {
  Buffer.resize(Size ? Size : 1);
  AvailableEntries = Buffer.size();
}

Error MicroOpQueueStage::execute(InstRef &IR) {
  Buffer[NextAvailableSlotIdx] = IR;
  unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
  NextAvailableSlotIdx += NormalizedOpcodes;
  NextAvailableSlotIdx %= Buffer.size();
  AvailableEntries -= NormalizedOpcodes;
  ++CurrentIPC;
  return llvm::ErrorSuccess();
}

Error MicroOpQueueStage::cycleStart() {
  CurrentIPC = 0;

  if (!IsZeroLatencyStage)
    return moveInstructions();
  return llvm::ErrorSuccess();
}

Error MicroOpQueueStage::cycleEnd() {
  if (IsZeroLatencyStage)
    return moveInstructions();
  return llvm::ErrorSuccess();
}

} // namespace mca
} // namespace llvm
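
Note that, given the cycleStart()/cycleEnd() hooks above, the default
(zero-latency) queue forwards instructions at the end of the same cycle in
which they arrive, while a queue constructed with ZeroLatencyStage=false only
forwards them at the start of the next cycle. The default pipeline in this
patch never passes ZeroLatencyStage=false; the snippet below is only a sketch
of how an out-of-tree pipeline might model a one-cycle decode delay (queue size
and IPC are made-up values).

  StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
      /*Size=*/32, /*IPC=*/4, /*ZeroLatencyStage=*/false));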

@@ -0,0 +1,105 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-1
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-2
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=3 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-3
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-4
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-DEC-2
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-1
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-2
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-DEC-1
add %eax, %eax
add %ebx, %ebx
add %ecx, %ecx
add %edx, %edx
# BTVER2-DEC-2: Iterations: 1500
# BTVER2-DEC-2-NEXT: Instructions: 6000
# BTVER2-DEC-2-NEXT: Total Cycles: 3003
# BTVER2-DEC-2-NEXT: Total uOps: 6000
# BTVER2-DEC-2: Dispatch Width: 2
# BTVER2-DEC-2-NEXT: uOps Per Cycle: 2.00
# BTVER2-DEC-2-NEXT: IPC: 2.00
# BTVER2-DEC-2-NEXT: Block RThroughput: 2.0
# BTVER2-DEC-1: Iterations: 1500
# BTVER2-DEC-1-NEXT: Instructions: 6000
# BTVER2-DEC-1-NEXT: Total Cycles: 6003
# BTVER2-DEC-1-NEXT: Total uOps: 6000
# BTVER2-UOPQ-1: Iterations: 1500
# BTVER2-UOPQ-1-NEXT: Instructions: 6000
# BTVER2-UOPQ-1-NEXT: Total Cycles: 6003
# BTVER2-UOPQ-1-NEXT: Total uOps: 6000
# BTVER2-UOPQ-2: Iterations: 1500
# BTVER2-UOPQ-2-NEXT: Instructions: 6000
# BTVER2-UOPQ-2-NEXT: Total Cycles: 3003
# BTVER2-UOPQ-2-NEXT: Total uOps: 6000
# HASWELL-DEC-2: Iterations: 1500
# HASWELL-DEC-2-NEXT: Instructions: 6000
# HASWELL-DEC-2-NEXT: Total Cycles: 3003
# HASWELL-DEC-2-NEXT: Total uOps: 6000
# HASWELL-UOPQ-1: Iterations: 1500
# HASWELL-UOPQ-1-NEXT: Instructions: 6000
# HASWELL-UOPQ-1-NEXT: Total Cycles: 6003
# HASWELL-UOPQ-1-NEXT: Total uOps: 6000
# HASWELL-UOPQ-2: Iterations: 1500
# HASWELL-UOPQ-2-NEXT: Instructions: 6000
# HASWELL-UOPQ-2-NEXT: Total Cycles: 3003
# HASWELL-UOPQ-2-NEXT: Total uOps: 6000
# HASWELL-UOPQ-3: Iterations: 1500
# HASWELL-UOPQ-3-NEXT: Instructions: 6000
# HASWELL-UOPQ-3-NEXT: Total Cycles: 2003
# HASWELL-UOPQ-3-NEXT: Total uOps: 6000
# HASWELL-UOPQ-4: Iterations: 1500
# HASWELL-UOPQ-4-NEXT: Instructions: 6000
# HASWELL-UOPQ-4-NEXT: Total Cycles: 1503
# HASWELL-UOPQ-4-NEXT: Total uOps: 6000
# BTVER2-DEC-1: Dispatch Width: 2
# BTVER2-DEC-1-NEXT: uOps Per Cycle: 1.00
# BTVER2-DEC-1-NEXT: IPC: 1.00
# BTVER2-DEC-1-NEXT: Block RThroughput: 2.0
# BTVER2-UOPQ-1: Dispatch Width: 2
# BTVER2-UOPQ-1-NEXT: uOps Per Cycle: 1.00
# BTVER2-UOPQ-1-NEXT: IPC: 1.00
# BTVER2-UOPQ-1-NEXT: Block RThroughput: 2.0
# BTVER2-UOPQ-2: Dispatch Width: 2
# BTVER2-UOPQ-2-NEXT: uOps Per Cycle: 2.00
# BTVER2-UOPQ-2-NEXT: IPC: 2.00
# BTVER2-UOPQ-2-NEXT: Block RThroughput: 2.0
# HASWELL-DEC-2: Dispatch Width: 4
# HASWELL-DEC-2-NEXT: uOps Per Cycle: 2.00
# HASWELL-DEC-2-NEXT: IPC: 2.00
# HASWELL-DEC-2-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-1: Dispatch Width: 4
# HASWELL-UOPQ-1-NEXT: uOps Per Cycle: 1.00
# HASWELL-UOPQ-1-NEXT: IPC: 1.00
# HASWELL-UOPQ-1-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-2: Dispatch Width: 4
# HASWELL-UOPQ-2-NEXT: uOps Per Cycle: 2.00
# HASWELL-UOPQ-2-NEXT: IPC: 2.00
# HASWELL-UOPQ-2-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-3: Dispatch Width: 4
# HASWELL-UOPQ-3-NEXT: uOps Per Cycle: 3.00
# HASWELL-UOPQ-3-NEXT: IPC: 3.00
# HASWELL-UOPQ-3-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-4: Dispatch Width: 4
# HASWELL-UOPQ-4-NEXT: uOps Per Cycle: 3.99
# HASWELL-UOPQ-4-NEXT: IPC: 3.99
# HASWELL-UOPQ-4-NEXT: Block RThroughput: 1.0
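
Note that the numbers above follow a simple pattern: for this block of
single-uOp instructions, the sustained IPC is roughly
min(micro-op-queue-size, decoder-throughput, dispatch width), i.e. it tops out
at 4 on haswell and 2 on btver2 once the queue is large enough. For example,
with a 1-entry queue only one instruction flows through per cycle, so
1500 iterations x 4 instructions = 6000 instructions take about 6000 cycles
(6003 measured), while a 4-entry queue on haswell approaches the dispatch
width of 4 (1503 cycles, 3.99 uOps per cycle).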

@@ -100,6 +100,17 @@ static cl::opt<unsigned>
                               "be used for register mappings"),
                      cl::cat(ToolOptions), cl::init(0));
 
+static cl::opt<unsigned>
+    MicroOpQueue("micro-op-queue-size", cl::Hidden,
+                 cl::desc("Number of entries in the micro-op queue"),
+                 cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+    DecoderThroughput("decoder-throughput", cl::Hidden,
+                      cl::desc("Maximum throughput from the decoders "
+                               "(instructions per cycle)"),
+                      cl::cat(ToolOptions), cl::init(0));
+
 static cl::opt<bool>
     PrintRegisterFileStats("register-file-stats",
                            cl::desc("Print register file statistics"),
@@ -387,9 +398,9 @@ int main(int argc, char **argv) {
   // Create a context to control ownership of the pipeline hardware.
   mca::Context MCA(*MRI, *STI);
-  mca::PipelineOptions PO(DispatchWidth, RegisterFileSize, LoadQueueSize,
-                          StoreQueueSize, AssumeNoAlias,
-                          EnableBottleneckAnalysis);
+  mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth,
+                          RegisterFileSize, LoadQueueSize, StoreQueueSize,
+                          AssumeNoAlias, EnableBottleneckAnalysis);
 
   // Number each region in the sequence.
   unsigned RegionIdx = 0;