[MCA] Add an experimental MicroOpQueue stage.

This patch adds an experimental stage named MicroOpQueueStage.
MicroOpQueueStage can be used to simulate a hardware micro-op queue (basically,
a decoupling queue between 'decode' and 'dispatch').  Users can specify a queue
size, as well as an optional MaxIPC (which, in the absence of a "Decoders"
stage, can be used to simulate a different throughput from the decoders).
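
As an illustration only (a minimal sketch, not part of the patch): a tool that
builds its own pipeline could append the new stage roughly as follows, using
the constructor added by this patch. The StagePipeline variable and the chosen
queue size / MaxIPC values are made up for the example.

  // Model a 28-entry micro-op queue that accepts at most 4 instructions per
  // cycle (illustrative values only).
  StagePipeline->appendStage(
      llvm::make_unique<MicroOpQueueStage>(/*Size=*/28, /*IPC=*/4));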

This stage is added to the default pipeline between the EntryStage and the
DispatchStage only if PipelineOptions::MicroOpQueueSize is non-zero. By
default, llvm-mca sets PipelineOptions::MicroOpQueueSize to the value of the
hidden flag -micro-op-queue-size.

Throughput from the decoders can be simulated via another hidden flag named
-decoder-throughput.  That flag makes it easy to quickly experiment with
different frontend throughputs.  For targets that declare a loop buffer, the
-decoder-throughput flag lets users perform multiple runs, each time simulating
a different throughput from the decoders.
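
For example (assuming an input assembly file named foo.s; the two flags below
are the hidden flags added by this patch), one could compare a run that is only
limited by the queue size against a run that also caps the decoder throughput:

  $ llvm-mca -mcpu=haswell -micro-op-queue-size=4 foo.s
  $ llvm-mca -mcpu=haswell -micro-op-queue-size=4 -decoder-throughput=2 foo.s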

This stage can (and will) be extended in the future. For example, we could add
a "buffer full" event to report bottlenecks caused by backpressure.  Flag
-decoder-throughput would probably go away if, in the future, we delegated the
simulation of a (potentially variable) decoder throughput to another stage
(a DecoderStage?).  For now, flag -decoder-throughput is "good enough" to run
some simple experiments.

Differential Revision: https://reviews.llvm.org/D59928

llvm-svn: 357248
Andrea Di Biagio 2019-03-29 12:15:37 +00:00
parent 2b766ed774
commit e074ac60b4
7 changed files with 289 additions and 6 deletions

@@ -31,11 +31,15 @@ namespace mca {
 /// This is a convenience struct to hold the parameters necessary for creating
 /// the pre-built "default" out-of-order pipeline.
 struct PipelineOptions {
-  PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
-                  bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
-      : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
+  PipelineOptions(unsigned UOPQSize, unsigned DecThr, unsigned DW, unsigned RFS,
+                  unsigned LQS, unsigned SQS, bool NoAlias,
+                  bool ShouldEnableBottleneckAnalysis = false)
+      : MicroOpQueueSize(UOPQSize), DecodersThroughput(DecThr),
+        DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
         StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
         EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
+  unsigned MicroOpQueueSize;
+  unsigned DecodersThroughput; // Instructions per cycle.
   unsigned DispatchWidth;
   unsigned RegisterFileSize;
   unsigned LoadQueueSize;

@@ -0,0 +1,88 @@
//===---------------------- MicroOpQueueStage.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file defines a stage that implements a queue of micro opcodes.
/// It can be used to simulate a hardware micro-op queue that serves opcodes to
/// the out-of-order backend.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
#define LLVM_MCA_MICRO_OP_QUEUE_STAGE_H

#include "llvm/ADT/SmallVector.h"
#include "llvm/MCA/Stages/Stage.h"

namespace llvm {
namespace mca {

/// A stage that simulates a queue of instruction opcodes.
class MicroOpQueueStage : public Stage {
  SmallVector<InstRef, 8> Buffer;
  unsigned NextAvailableSlotIdx;
  unsigned CurrentInstructionSlotIdx;

  // Limits the number of instructions that can be written to this buffer every
  // cycle. A value of zero means that there is no limit to the instruction
  // throughput in input.
  const unsigned MaxIPC;
  unsigned CurrentIPC;

  // Number of entries that are available during this cycle.
  unsigned AvailableEntries;

  // True if instructions dispatched to this stage don't need to wait for the
  // next cycle before moving to the next stage.
  // False if this buffer acts as a one cycle delay in the execution pipeline.
  bool IsZeroLatencyStage;

  MicroOpQueueStage(const MicroOpQueueStage &Other) = delete;
  MicroOpQueueStage &operator=(const MicroOpQueueStage &Other) = delete;

  // By default, an instruction consumes a number of buffer entries equal to its
  // number of micro opcodes (see field `InstrDesc::NumMicroOps`). The number of
  // entries consumed by an instruction is normalized to the minimum value
  // between NumMicroOps and the buffer size. This is to avoid problems with
  // (microcoded) instructions that generate a number of micro opcodes that
  // doesn't fit in the buffer.
  unsigned getNormalizedOpcodes(const InstRef &IR) const {
    unsigned NormalizedOpcodes =
        std::min(static_cast<unsigned>(Buffer.size()),
                 IR.getInstruction()->getDesc().NumMicroOps);
    return NormalizedOpcodes ? NormalizedOpcodes : 1U;
  }

  Error moveInstructions();

public:
  MicroOpQueueStage(unsigned Size, unsigned IPC = 0,
                    bool ZeroLatencyStage = true);

  bool isAvailable(const InstRef &IR) const override {
    if (MaxIPC && CurrentIPC == MaxIPC)
      return false;
    unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
    if (NormalizedOpcodes > AvailableEntries)
      return false;
    return true;
  }

  bool hasWorkToComplete() const override {
    return AvailableEntries != Buffer.size();
  }

  Error execute(InstRef &IR) override;
  Error cycleStart() override;
  Error cycleEnd() override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
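
A quick standalone illustration of the clamp performed by getNormalizedOpcodes
above (not part of the patch; the values below are made up): a microcoded
instruction whose uOp count exceeds the queue size is clamped to the queue
size, so it can still be buffered.

  unsigned BufferSize = 8;                                  // hypothetical queue size
  unsigned NumMicroOps = 11;                                // hypothetical microcoded instruction
  unsigned Normalized = std::min(BufferSize, NumMicroOps);  // -> 8 (clamped to the buffer size)
  unsigned Consumed = Normalized ? Normalized : 1U;         // zero-uOp instructions still use 1 entry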

@@ -14,6 +14,7 @@ add_llvm_library(LLVMMCA
   Stages/EntryStage.cpp
   Stages/ExecuteStage.cpp
   Stages/InstructionTables.cpp
+  Stages/MicroOpQueueStage.cpp
   Stages/RetireStage.cpp
   Stages/Stage.cpp
   Support.cpp

@@ -21,6 +21,7 @@
 #include "llvm/MCA/Stages/DispatchStage.h"
 #include "llvm/MCA/Stages/EntryStage.h"
 #include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
 #include "llvm/MCA/Stages/RetireStage.h"
 
 namespace llvm {
@@ -55,6 +56,9 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
   // Build the pipeline.
   auto StagePipeline = llvm::make_unique<Pipeline>();
   StagePipeline->appendStage(std::move(Fetch));
+  if (Opts.MicroOpQueueSize)
+    StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
+        Opts.MicroOpQueueSize, Opts.DecodersThroughput));
   StagePipeline->appendStage(std::move(Dispatch));
   StagePipeline->appendStage(std::move(Execute));
   StagePipeline->appendStage(std::move(Retire));

@@ -0,0 +1,70 @@
//===---------------------- MicroOpQueueStage.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file defines the MicroOpQueueStage.
///
//===----------------------------------------------------------------------===//

#include "llvm/MCA/Stages/MicroOpQueueStage.h"

namespace llvm {
namespace mca {

#define DEBUG_TYPE "llvm-mca"

Error MicroOpQueueStage::moveInstructions() {
  InstRef IR = Buffer[CurrentInstructionSlotIdx];
  while (IR && checkNextStage(IR)) {
    if (llvm::Error Val = moveToTheNextStage(IR))
      return Val;

    Buffer[CurrentInstructionSlotIdx].invalidate();
    unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
    CurrentInstructionSlotIdx += NormalizedOpcodes;
    CurrentInstructionSlotIdx %= Buffer.size();
    AvailableEntries += NormalizedOpcodes;
    IR = Buffer[CurrentInstructionSlotIdx];
  }

  return llvm::ErrorSuccess();
}

MicroOpQueueStage::MicroOpQueueStage(unsigned Size, unsigned IPC,
                                     bool ZeroLatencyStage)
    : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), MaxIPC(IPC),
      CurrentIPC(0), IsZeroLatencyStage(ZeroLatencyStage) {
  Buffer.resize(Size ? Size : 1);
  AvailableEntries = Buffer.size();
}

Error MicroOpQueueStage::execute(InstRef &IR) {
  Buffer[NextAvailableSlotIdx] = IR;
  unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
  NextAvailableSlotIdx += NormalizedOpcodes;
  NextAvailableSlotIdx %= Buffer.size();
  AvailableEntries -= NormalizedOpcodes;
  ++CurrentIPC;
  return llvm::ErrorSuccess();
}

Error MicroOpQueueStage::cycleStart() {
  CurrentIPC = 0;

  if (!IsZeroLatencyStage)
    return moveInstructions();
  return llvm::ErrorSuccess();
}

Error MicroOpQueueStage::cycleEnd() {
  if (IsZeroLatencyStage)
    return moveInstructions();
  return llvm::ErrorSuccess();
}

} // namespace mca
} // namespace llvm
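
Note that, given the cycleStart()/cycleEnd() hooks above, the default
(zero-latency) queue forwards instructions at the end of the same cycle in
which they arrive, while a queue constructed with ZeroLatencyStage=false only
forwards them at the start of the next cycle. The default pipeline in this
patch never passes ZeroLatencyStage=false; the snippet below is only a sketch
of how an out-of-tree pipeline might model a one-cycle decode delay (queue size
and IPC are made-up values).

  StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
      /*Size=*/32, /*IPC=*/4, /*ZeroLatencyStage=*/false));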

@@ -0,0 +1,105 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-1
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-2
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=3 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-3
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-4
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-DEC-2
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-1
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-2
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-DEC-1
add %eax, %eax
add %ebx, %ebx
add %ecx, %ecx
add %edx, %edx
# BTVER2-DEC-2: Iterations: 1500
# BTVER2-DEC-2-NEXT: Instructions: 6000
# BTVER2-DEC-2-NEXT: Total Cycles: 3003
# BTVER2-DEC-2-NEXT: Total uOps: 6000
# BTVER2-DEC-2: Dispatch Width: 2
# BTVER2-DEC-2-NEXT: uOps Per Cycle: 2.00
# BTVER2-DEC-2-NEXT: IPC: 2.00
# BTVER2-DEC-2-NEXT: Block RThroughput: 2.0
# BTVER2-DEC-1: Iterations: 1500
# BTVER2-DEC-1-NEXT: Instructions: 6000
# BTVER2-DEC-1-NEXT: Total Cycles: 6003
# BTVER2-DEC-1-NEXT: Total uOps: 6000
# BTVER2-UOPQ-1: Iterations: 1500
# BTVER2-UOPQ-1-NEXT: Instructions: 6000
# BTVER2-UOPQ-1-NEXT: Total Cycles: 6003
# BTVER2-UOPQ-1-NEXT: Total uOps: 6000
# BTVER2-UOPQ-2: Iterations: 1500
# BTVER2-UOPQ-2-NEXT: Instructions: 6000
# BTVER2-UOPQ-2-NEXT: Total Cycles: 3003
# BTVER2-UOPQ-2-NEXT: Total uOps: 6000
# HASWELL-DEC-2: Iterations: 1500
# HASWELL-DEC-2-NEXT: Instructions: 6000
# HASWELL-DEC-2-NEXT: Total Cycles: 3003
# HASWELL-DEC-2-NEXT: Total uOps: 6000
# HASWELL-UOPQ-1: Iterations: 1500
# HASWELL-UOPQ-1-NEXT: Instructions: 6000
# HASWELL-UOPQ-1-NEXT: Total Cycles: 6003
# HASWELL-UOPQ-1-NEXT: Total uOps: 6000
# HASWELL-UOPQ-2: Iterations: 1500
# HASWELL-UOPQ-2-NEXT: Instructions: 6000
# HASWELL-UOPQ-2-NEXT: Total Cycles: 3003
# HASWELL-UOPQ-2-NEXT: Total uOps: 6000
# HASWELL-UOPQ-3: Iterations: 1500
# HASWELL-UOPQ-3-NEXT: Instructions: 6000
# HASWELL-UOPQ-3-NEXT: Total Cycles: 2003
# HASWELL-UOPQ-3-NEXT: Total uOps: 6000
# HASWELL-UOPQ-4: Iterations: 1500
# HASWELL-UOPQ-4-NEXT: Instructions: 6000
# HASWELL-UOPQ-4-NEXT: Total Cycles: 1503
# HASWELL-UOPQ-4-NEXT: Total uOps: 6000
# BTVER2-DEC-1: Dispatch Width: 2
# BTVER2-DEC-1-NEXT: uOps Per Cycle: 1.00
# BTVER2-DEC-1-NEXT: IPC: 1.00
# BTVER2-DEC-1-NEXT: Block RThroughput: 2.0
# BTVER2-UOPQ-1: Dispatch Width: 2
# BTVER2-UOPQ-1-NEXT: uOps Per Cycle: 1.00
# BTVER2-UOPQ-1-NEXT: IPC: 1.00
# BTVER2-UOPQ-1-NEXT: Block RThroughput: 2.0
# BTVER2-UOPQ-2: Dispatch Width: 2
# BTVER2-UOPQ-2-NEXT: uOps Per Cycle: 2.00
# BTVER2-UOPQ-2-NEXT: IPC: 2.00
# BTVER2-UOPQ-2-NEXT: Block RThroughput: 2.0
# HASWELL-DEC-2: Dispatch Width: 4
# HASWELL-DEC-2-NEXT: uOps Per Cycle: 2.00
# HASWELL-DEC-2-NEXT: IPC: 2.00
# HASWELL-DEC-2-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-1: Dispatch Width: 4
# HASWELL-UOPQ-1-NEXT: uOps Per Cycle: 1.00
# HASWELL-UOPQ-1-NEXT: IPC: 1.00
# HASWELL-UOPQ-1-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-2: Dispatch Width: 4
# HASWELL-UOPQ-2-NEXT: uOps Per Cycle: 2.00
# HASWELL-UOPQ-2-NEXT: IPC: 2.00
# HASWELL-UOPQ-2-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-3: Dispatch Width: 4
# HASWELL-UOPQ-3-NEXT: uOps Per Cycle: 3.00
# HASWELL-UOPQ-3-NEXT: IPC: 3.00
# HASWELL-UOPQ-3-NEXT: Block RThroughput: 1.0
# HASWELL-UOPQ-4: Dispatch Width: 4
# HASWELL-UOPQ-4-NEXT: uOps Per Cycle: 3.99
# HASWELL-UOPQ-4-NEXT: IPC: 3.99
# HASWELL-UOPQ-4-NEXT: Block RThroughput: 1.0
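
Note that the numbers above follow a simple pattern: for this block of
single-uOp instructions, the sustained IPC is roughly
min(micro-op-queue-size, decoder-throughput, dispatch width), i.e. it tops out
at 4 on haswell and 2 on btver2 once the queue is large enough. For example,
with a 1-entry queue only one instruction flows through per cycle, so
1500 iterations x 4 instructions = 6000 instructions take about 6000 cycles
(6003 measured), while a 4-entry queue on haswell approaches the dispatch
width of 4 (1503 cycles, 3.99 uOps per cycle).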

@@ -100,6 +100,17 @@ static cl::opt<unsigned>
                               "be used for register mappings"),
                      cl::cat(ToolOptions), cl::init(0));
 
+static cl::opt<unsigned>
+    MicroOpQueue("micro-op-queue-size", cl::Hidden,
+                 cl::desc("Number of entries in the micro-op queue"),
+                 cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+    DecoderThroughput("decoder-throughput", cl::Hidden,
+                      cl::desc("Maximum throughput from the decoders "
+                               "(instructions per cycle)"),
+                      cl::cat(ToolOptions), cl::init(0));
+
 static cl::opt<bool>
     PrintRegisterFileStats("register-file-stats",
                            cl::desc("Print register file statistics"),
@@ -387,9 +398,9 @@ int main(int argc, char **argv) {
   // Create a context to control ownership of the pipeline hardware.
   mca::Context MCA(*MRI, *STI);
-  mca::PipelineOptions PO(DispatchWidth, RegisterFileSize, LoadQueueSize,
-                          StoreQueueSize, AssumeNoAlias,
-                          EnableBottleneckAnalysis);
+  mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth,
+                          RegisterFileSize, LoadQueueSize, StoreQueueSize,
+                          AssumeNoAlias, EnableBottleneckAnalysis);
 
   // Number each region in the sequence.
   unsigned RegionIdx = 0;