forked from OSchip/llvm-project
[llvm-exegesis] Add throughput mode.
Summary: This just uses the latency benchmark runner on the parallel uops snippet generator. Fixes PR37698.
Reviewers: gchatelet
Subscribers: tschuett, RKSimon, llvm-commits
Differential Revision: https://reviews.llvm.org/D57000
llvm-svn: 352632
This commit is contained in:
parent
4f9e3ce070
commit
362653f7af
|
@ -10,13 +10,13 @@ DESCRIPTION
|
|||
-----------
|
||||
|
||||
:program:`llvm-exegesis` is a benchmarking tool that uses information available
|
||||
in LLVM to measure host machine instruction characteristics like latency or port
|
||||
decomposition.
|
||||
in LLVM to measure host machine instruction characteristics like latency,
|
||||
throughput, or port decomposition.
|
||||
|
||||
Given an LLVM opcode name and a benchmarking mode, :program:`llvm-exegesis`
|
||||
generates a code snippet that makes execution as serial (resp. as parallel) as
|
||||
possible so that we can measure the latency (resp. uop decomposition) of the
|
||||
instruction.
|
||||
possible so that we can measure the latency (resp. inverse throughput/uop decomposition)
|
||||
of the instruction.
|
||||
The code snippet is jitted and executed on the host subtarget. The time taken
|
||||
(resp. resource usage) is measured using hardware performance counters. The
|
||||
result is printed out as YAML to the standard output.
|
||||
|
@ -37,11 +37,13 @@ instruction, run:
|
|||
|
||||
$ llvm-exegesis -mode=latency -opcode-name=ADD64rr
|
||||
|
||||
Measuring the uop decomposition of an instruction works similarly:
|
||||
Measuring the uop decomposition or inverse throughput of an instruction works similarly:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ llvm-exegesis -mode=uops -opcode-name=ADD64rr
|
||||
$ llvm-exegesis -mode=inverse_throughput -opcode-name=ADD64rr
|
||||
|
||||
|
||||
The output is a YAML document (the default is to write to stdout, but you can
|
||||
redirect the output to a file using `-benchmarks-file`):
|
||||
|
@ -186,7 +188,7 @@ OPTIONS
|
|||
Specify the custom code snippet to measure. See example 2 for details.
|
||||
Either `opcode-index`, `opcode-name` or `snippets-file` must be set.
|
||||
|
||||
.. option:: -mode=[latency|uops|analysis]
|
||||
.. option:: -mode=[latency|uops|inverse_throughput|analysis]
|
||||
|
||||
Specify the run mode.
|
||||
|
||||
|
@ -197,8 +199,8 @@ OPTIONS
|
|||
|
||||
.. option:: -benchmarks-file=</path/to/file>
|
||||
|
||||
File to read (`analysis` mode) or write (`latency`/`uops` modes) benchmark
|
||||
results. "-" uses stdin/stdout.
|
||||
File to read (`analysis` mode) or write (`latency`/`uops`/`inverse_throughput`
|
||||
modes) benchmark results. "-" uses stdin/stdout.
|
||||
|
||||
.. option:: -analysis-clusters-output-file=</path/to/file>
|
||||
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
# RUN: llvm-exegesis -mode=inverse_throughput -opcode-name=ADD32rr | FileCheck %s
|
||||
|
||||
CHECK: ---
|
||||
CHECK-NEXT: mode: inverse_throughput
|
||||
CHECK-NEXT: key:
|
||||
CHECK-NEXT: instructions:
|
||||
CHECK-NEXT: ADD32rr
|
||||
CHECK: key: inverse_throughput
|
|
@ -209,6 +209,8 @@ struct ScalarEnumerationTraits<exegesis::InstructionBenchmark::ModeE> {
|
|||
Io.enumCase(Value, "", exegesis::InstructionBenchmark::Unknown);
|
||||
Io.enumCase(Value, "latency", exegesis::InstructionBenchmark::Latency);
|
||||
Io.enumCase(Value, "uops", exegesis::InstructionBenchmark::Uops);
|
||||
Io.enumCase(Value, "inverse_throughput",
|
||||
exegesis::InstructionBenchmark::InverseThroughput);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ struct BenchmarkMeasure {
|
|||
// The result of an instruction benchmark.
|
||||
struct InstructionBenchmark {
|
||||
InstructionBenchmarkKey Key;
|
||||
enum ModeE { Unknown, Latency, Uops };
|
||||
enum ModeE { Unknown, Latency, Uops, InverseThroughput };
|
||||
ModeE Mode;
|
||||
std::string CpuName;
|
||||
std::string LLVMTriple;
|
||||
|
|
|
@ -75,6 +75,7 @@ public:
|
|||
|
||||
protected:
|
||||
const LLVMState &State;
|
||||
const InstructionBenchmark::ModeE Mode;
|
||||
|
||||
private:
|
||||
virtual llvm::Expected<std::vector<BenchmarkMeasure>>
|
||||
|
@ -84,7 +85,6 @@ private:
|
|||
writeObjectFile(const BenchmarkCode &Configuration,
|
||||
llvm::ArrayRef<llvm::MCInst> Code) const;
|
||||
|
||||
const InstructionBenchmark::ModeE Mode;
|
||||
|
||||
const std::unique_ptr<ScratchSpace> Scratch;
|
||||
};
|
||||
|
|
|
@ -165,6 +165,14 @@ LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
|
|||
return std::move(Results);
|
||||
}
|
||||
|
||||
// Constructs a LatencyBenchmarkRunner for the given benchmark mode.
//
// State and Mode are forwarded unchanged to the BenchmarkRunner base class.
// The assert below restricts Mode to Latency or InverseThroughput; any other
// value trips the "invalid mode" assertion.
LatencyBenchmarkRunner::LatencyBenchmarkRunner(const LLVMState &State,
|
||||
InstructionBenchmark::ModeE Mode)
|
||||
: BenchmarkRunner(State, Mode) {
|
||||
assert((Mode == InstructionBenchmark::Latency ||
|
||||
Mode == InstructionBenchmark::InverseThroughput) &&
|
||||
"invalid mode");
|
||||
}
|
||||
|
||||
LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default;
|
||||
|
||||
llvm::Expected<std::vector<BenchmarkMeasure>>
|
||||
|
@ -184,8 +192,17 @@ LatencyBenchmarkRunner::runMeasurements(
|
|||
if (*ExpectedCounterValue < MinValue)
|
||||
MinValue = *ExpectedCounterValue;
|
||||
}
|
||||
std::vector<BenchmarkMeasure> Result = {
|
||||
BenchmarkMeasure::Create("latency", MinValue)};
|
||||
std::vector<BenchmarkMeasure> Result;
|
||||
switch (Mode) {
|
||||
case InstructionBenchmark::Latency:
|
||||
Result = {BenchmarkMeasure::Create("latency", MinValue)};
|
||||
break;
|
||||
case InstructionBenchmark::InverseThroughput:
|
||||
Result = {BenchmarkMeasure::Create("inverse_throughput", MinValue)};
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return std::move(Result);
|
||||
}
|
||||
|
||||
|
|
|
@ -32,8 +32,8 @@ public:
|
|||
|
||||
class LatencyBenchmarkRunner : public BenchmarkRunner {
|
||||
public:
|
||||
LatencyBenchmarkRunner(const LLVMState &State)
|
||||
: BenchmarkRunner(State, InstructionBenchmark::Latency) {}
|
||||
LatencyBenchmarkRunner(const LLVMState &State,
|
||||
InstructionBenchmark::ModeE Mode);
|
||||
~LatencyBenchmarkRunner() override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -45,6 +45,7 @@ ExegesisTarget::createSnippetGenerator(InstructionBenchmark::ModeE Mode,
|
|||
case InstructionBenchmark::Latency:
|
||||
return createLatencySnippetGenerator(State);
|
||||
case InstructionBenchmark::Uops:
|
||||
case InstructionBenchmark::InverseThroughput:
|
||||
return createUopsSnippetGenerator(State);
|
||||
}
|
||||
return nullptr;
|
||||
|
@ -57,7 +58,8 @@ ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode,
|
|||
case InstructionBenchmark::Unknown:
|
||||
return nullptr;
|
||||
case InstructionBenchmark::Latency:
|
||||
return createLatencyBenchmarkRunner(State);
|
||||
case InstructionBenchmark::InverseThroughput:
|
||||
return createLatencyBenchmarkRunner(State, Mode);
|
||||
case InstructionBenchmark::Uops:
|
||||
return createUopsBenchmarkRunner(State);
|
||||
}
|
||||
|
@ -74,9 +76,9 @@ ExegesisTarget::createUopsSnippetGenerator(const LLVMState &State) const {
|
|||
return llvm::make_unique<UopsSnippetGenerator>(State);
|
||||
}
|
||||
|
||||
std::unique_ptr<BenchmarkRunner>
|
||||
ExegesisTarget::createLatencyBenchmarkRunner(const LLVMState &State) const {
|
||||
return llvm::make_unique<LatencyBenchmarkRunner>(State);
|
||||
std::unique_ptr<BenchmarkRunner> ExegesisTarget::createLatencyBenchmarkRunner(
|
||||
const LLVMState &State, InstructionBenchmark::ModeE Mode) const {
|
||||
return llvm::make_unique<LatencyBenchmarkRunner>(State, Mode);
|
||||
}
|
||||
|
||||
std::unique_ptr<BenchmarkRunner>
|
||||
|
|
|
@ -130,7 +130,7 @@ private:
|
|||
std::unique_ptr<SnippetGenerator> virtual createUopsSnippetGenerator(
|
||||
const LLVMState &State) const;
|
||||
std::unique_ptr<BenchmarkRunner> virtual createLatencyBenchmarkRunner(
|
||||
const LLVMState &State) const;
|
||||
const LLVMState &State, InstructionBenchmark::ModeE Mode) const;
|
||||
std::unique_ptr<BenchmarkRunner> virtual createUopsBenchmarkRunner(
|
||||
const LLVMState &State) const;
|
||||
|
||||
|
|
|
@ -56,16 +56,19 @@ static cl::opt<std::string> SnippetsFile("snippets-file",
|
|||
static cl::opt<std::string> BenchmarkFile("benchmarks-file", cl::desc(""),
|
||||
cl::init(""));
|
||||
|
||||
static cl::opt<exegesis::InstructionBenchmark::ModeE>
|
||||
BenchmarkMode("mode", cl::desc("the mode to run"),
|
||||
cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency,
|
||||
"latency", "Instruction Latency"),
|
||||
clEnumValN(exegesis::InstructionBenchmark::Uops,
|
||||
"uops", "Uop Decomposition"),
|
||||
// When not asking for a specific benchmark mode,
|
||||
// we'll analyse the results.
|
||||
clEnumValN(exegesis::InstructionBenchmark::Unknown,
|
||||
"analysis", "Analysis")));
|
||||
static cl::opt<exegesis::InstructionBenchmark::ModeE> BenchmarkMode(
|
||||
"mode", cl::desc("the mode to run"),
|
||||
cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency, "latency",
|
||||
"Instruction Latency"),
|
||||
clEnumValN(exegesis::InstructionBenchmark::InverseThroughput,
|
||||
"inverse_throughput",
|
||||
"Instruction Inverse Throughput"),
|
||||
clEnumValN(exegesis::InstructionBenchmark::Uops, "uops",
|
||||
"Uop Decomposition"),
|
||||
// When not asking for a specific benchmark mode,
|
||||
// we'll analyse the results.
|
||||
clEnumValN(exegesis::InstructionBenchmark::Unknown, "analysis",
|
||||
"Analysis")));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
NumRepetitions("num-repetitions",
|
||||
|
|
Loading…
Reference in New Issue