2018-04-04 19:37:06 +08:00
|
|
|
//===-- llvm-exegesis.cpp ---------------------------------------*- C++ -*-===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2018-04-04 19:37:06 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
///
|
|
|
|
/// \file
|
|
|
|
/// Measures execution properties (latencies/uops) of an instruction.
|
|
|
|
///
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2018-05-15 20:08:00 +08:00
|
|
|
#include "lib/Analysis.h"
|
2018-04-04 19:37:06 +08:00
|
|
|
#include "lib/BenchmarkResult.h"
|
|
|
|
#include "lib/BenchmarkRunner.h"
|
2018-05-15 20:08:00 +08:00
|
|
|
#include "lib/Clustering.h"
|
2018-04-04 19:37:06 +08:00
|
|
|
#include "lib/LlvmState.h"
|
|
|
|
#include "lib/PerfHelper.h"
|
2018-06-26 16:49:30 +08:00
|
|
|
#include "lib/Target.h"
|
2018-04-04 19:37:06 +08:00
|
|
|
#include "llvm/ADT/StringExtras.h"
|
|
|
|
#include "llvm/ADT/Twine.h"
|
|
|
|
#include "llvm/MC/MCInstBuilder.h"
|
2018-09-25 15:31:44 +08:00
|
|
|
#include "llvm/MC/MCObjectFileInfo.h"
|
|
|
|
#include "llvm/MC/MCParser/MCAsmParser.h"
|
|
|
|
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
|
2018-04-04 19:37:06 +08:00
|
|
|
#include "llvm/MC/MCRegisterInfo.h"
|
2018-09-25 15:31:44 +08:00
|
|
|
#include "llvm/MC/MCStreamer.h"
|
2018-05-15 20:08:00 +08:00
|
|
|
#include "llvm/MC/MCSubtargetInfo.h"
|
2018-09-25 15:31:44 +08:00
|
|
|
#include "llvm/Object/ObjectFile.h"
|
2018-04-04 19:37:06 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2018-05-15 20:08:00 +08:00
|
|
|
#include "llvm/Support/Format.h"
|
2018-04-04 19:37:06 +08:00
|
|
|
#include "llvm/Support/Path.h"
|
2018-09-25 15:31:44 +08:00
|
|
|
#include "llvm/Support/SourceMgr.h"
|
2018-05-15 20:08:00 +08:00
|
|
|
#include "llvm/Support/TargetRegistry.h"
|
2018-04-04 19:37:06 +08:00
|
|
|
#include "llvm/Support/TargetSelect.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#include <string>
|
|
|
|
|
2018-10-23 01:10:47 +08:00
|
|
|
namespace llvm {
|
|
|
|
namespace exegesis {
|
|
|
|
|
|
|
|
// Opcode/snippet selection. getOpcodesOrDie() requires that exactly one of
// -opcode-index, -opcode-name and -snippets-file is set.

// 0 means "not set"; getOpcodesOrDie() treats a negative value as "all
// opcodes".
static cl::opt<int> OpcodeIndex("opcode-index",
                                cl::desc("opcode to measure, by index"),
                                cl::init(0));

static cl::opt<std::string>
    OpcodeNames("opcode-name",
                cl::desc("comma-separated list of opcodes to measure, by name"),
                cl::init(""));

static cl::opt<std::string> SnippetsFile("snippets-file",
                                         cl::desc("code snippets to measure"),
                                         cl::init(""));

// In benchmark modes the results are written to this file; benchmarkMain()
// substitutes "-" (standard output) when it is left empty.
static cl::opt<std::string> BenchmarkFile(
    "benchmarks-file",
    cl::desc("file to write benchmark results to in benchmark modes (standard "
             "output if empty), or to read them from in analysis mode"),
    cl::init(""));

static cl::opt<exegesis::InstructionBenchmark::ModeE> BenchmarkMode(
    "mode", cl::desc("the mode to run"),
    cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency, "latency",
                          "Instruction Latency"),
               clEnumValN(exegesis::InstructionBenchmark::InverseThroughput,
                          "inverse_throughput",
                          "Instruction Inverse Throughput"),
               clEnumValN(exegesis::InstructionBenchmark::Uops, "uops",
                          "Uop Decomposition"),
               // When not asking for a specific benchmark mode,
               // we'll analyse the results.
               clEnumValN(exegesis::InstructionBenchmark::Unknown, "analysis",
                          "Analysis")));

// benchmarkMain() rejects a value of zero.
static cl::opt<unsigned>
    NumRepetitions("num-repetitions",
                   cl::desc("number of times to repeat the asm snippet"),
                   cl::init(10000));

static cl::opt<bool> IgnoreInvalidSchedClass(
    "ignore-invalid-sched-class",
    cl::desc("ignore instructions that do not define a sched class"),
    cl::init(false));

static cl::opt<unsigned> AnalysisNumPoints(
    "analysis-numpoints",
    cl::desc("minimum number of points in an analysis cluster"), cl::init(3));
|
2018-05-15 20:08:00 +08:00
|
|
|
|
[llvm-exegesis] Split Epsilon param into two (PR40787)
Summary:
This eps param is used for two distinct things:
* initial point clusterization
* checking clusters against the llvm values
What if one wants to only look at highly different clusters, without changing
the clustering itself? In particular, this helps to weed out noisy measurements
(since the clusterization epsilon is still small, there is a better chance
that noisy measurements from the same opcode will go into different clusters).
By splitting it into two params, this is now possible.
This is nearly-free performance-wise:
Old:
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 10099 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs):
390.01 msec task-clock # 0.998 CPUs utilized ( +- 0.25% )
12 context-switches # 31.735 M/sec ( +- 27.38% )
0 cpu-migrations # 0.000 K/sec
4745 page-faults # 12183.732 M/sec ( +- 0.54% )
1562711900 cycles # 4012303.327 GHz ( +- 0.24% ) (82.90%)
185567822 stalled-cycles-frontend # 11.87% frontend cycles idle ( +- 0.52% ) (83.30%)
392106234 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.31% ) (33.79%)
1839236666 instructions # 1.18 insn per cycle
# 0.21 stalled cycles per insn ( +- 0.15% ) (50.37%)
407035764 branches # 1045074878.710 M/sec ( +- 0.12% ) (66.80%)
10896459 branch-misses # 2.68% of all branches ( +- 0.17% ) (83.20%)
0.390629 +- 0.000972 seconds time elapsed ( +- 0.25% )
```
```
$ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 50572 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (9 runs):
6803.36 msec task-clock # 0.999 CPUs utilized ( +- 0.96% )
262 context-switches # 38.546 M/sec ( +- 23.06% )
0 cpu-migrations # 0.065 M/sec ( +- 76.03% )
13287 page-faults # 1953.206 M/sec ( +- 0.32% )
27252537904 cycles # 4006024.257 GHz ( +- 0.95% ) (83.31%)
1496314935 stalled-cycles-frontend # 5.49% frontend cycles idle ( +- 0.97% ) (83.32%)
16128404524 stalled-cycles-backend # 59.18% backend cycles idle ( +- 0.30% ) (33.37%)
17611143370 instructions # 0.65 insn per cycle
# 0.92 stalled cycles per insn ( +- 0.05% ) (50.04%)
3894906599 branches # 572537147.437 M/sec ( +- 0.03% ) (66.69%)
116314514 branch-misses # 2.99% of all branches ( +- 0.20% ) (83.35%)
6.8118 +- 0.0689 seconds time elapsed ( +- 1.01%)
```
New:
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 10099 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (25 runs):
400.14 msec task-clock # 0.998 CPUs utilized ( +- 0.66% )
12 context-switches # 29.429 M/sec ( +- 25.95% )
0 cpu-migrations # 0.100 M/sec ( +-100.00% )
4714 page-faults # 11796.496 M/sec ( +- 0.55% )
1603131306 cycles # 4011840.105 GHz ( +- 0.66% ) (82.85%)
199538509 stalled-cycles-frontend # 12.45% frontend cycles idle ( +- 2.40% ) (83.10%)
402249109 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.19% ) (34.05%)
1847783963 instructions # 1.15 insn per cycle
# 0.22 stalled cycles per insn ( +- 0.18% ) (50.64%)
407162722 branches # 1018925730.631 M/sec ( +- 0.12% ) (67.02%)
10932779 branch-misses # 2.69% of all branches ( +- 0.51% ) (83.28%)
0.40077 +- 0.00267 seconds time elapsed ( +- 0.67% )
```
```
$ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 50572 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (9 runs):
6947.79 msec task-clock # 1.000 CPUs utilized ( +- 0.90% )
217 context-switches # 31.236 M/sec ( +- 36.16% )
1 cpu-migrations # 0.096 M/sec ( +- 50.00% )
13258 page-faults # 1908.389 M/sec ( +- 0.34% )
27830796523 cycles # 4006032.286 GHz ( +- 0.89% ) (83.30%)
1504554006 stalled-cycles-frontend # 5.41% frontend cycles idle ( +- 2.10% ) (83.32%)
16716574843 stalled-cycles-backend # 60.07% backend cycles idle ( +- 0.65% ) (33.38%)
17755545931 instructions # 0.64 insn per cycle
# 0.94 stalled cycles per insn ( +- 0.09% ) (50.04%)
3897255686 branches # 560980426.597 M/sec ( +- 0.06% ) (66.70%)
117045395 branch-misses # 3.00% of all branches ( +- 0.47% ) (83.34%)
6.9507 +- 0.0627 seconds time elapsed ( +- 0.90% )
```
I.e. it's a +2.6% slowdown for one whole sweep, or +2% for 5 whole sweeps.
Within noise, I'd say.
Should help with [[ https://bugs.llvm.org/show_bug.cgi?id=40787 | PR40787 ]].
Reviewers: courbet, gchatelet
Reviewed By: courbet
Subscribers: tschuett, RKSimon, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58476
llvm-svn: 354767
2019-02-25 17:36:12 +08:00
|
|
|
// Analysis-mode tuning knobs. The clustering epsilon and the inconsistency
// epsilon are separate options, so the comparison against the LLVM values can
// be tuned without changing how points are clustered.
static cl::opt<float> AnalysisClusteringEpsilon(
    "analysis-clustering-epsilon",
    cl::desc("dbscan epsilon for benchmark point clustering"), cl::init(0.1));

static cl::opt<float> AnalysisInconsistencyEpsilon(
    "analysis-inconsistency-epsilon",
    cl::desc("epsilon for detection of when the cluster is different from the "
             "LLVM schedule profile values"),
    cl::init(0.1));

// Analysis-mode output files; both default to empty.
static cl::opt<std::string> AnalysisClustersOutputFile(
    "analysis-clusters-output-file",
    cl::desc("file to write the analysis clusters to"), cl::init(""));
static cl::opt<std::string> AnalysisInconsistenciesOutputFile(
    "analysis-inconsistencies-output-file",
    cl::desc("file to write the sched class consistency analysis results to"),
    cl::init(""));
|
2018-04-04 19:37:06 +08:00
|
|
|
|
[llvm-exegesis] Opcode stabilization / reclusterization (PR40715)
Summary:
Given an instruction `Opcode`, we can make benchmarks (measurements) of the
instruction characteristics/performance. Then, to facilitate further analysis
we group the benchmarks with *similar* characteristics into clusters.
Now, this is all not entirely deterministic. Some instructions have variable
characteristics, depending on their arguments. And thus, if we do several
benchmarks of the same instruction `Opcode`, we may end up with *different*
performance characteristics measurements. And when we then do clustering,
these several benchmarks of the same instruction `Opcode` may end up being
clustered into *different* clusters. This is not great for further analysis.
We shall find every `Opcode` with benchmarks not in just one cluster, and move
*all* the benchmarks of said `Opcode` into one new unstable cluster per `Opcode`.
I have solved this by making `ClusterId` a bit field, adding an `IsUnstable` bit,
and introducing the `-analysis-display-unstable-clusters` switch to toggle between
displaying stable-only clusters and unstable-only clusters.
The reclusterization is deterministically stable and produces identical reports
between runs. (Or at least that is what I'm seeing; maybe it isn't.)
Timings/comparisons:
old (current trunk/head) {F8303582}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs):
6624.73 msec task-clock # 0.999 CPUs utilized ( +- 0.53% )
172 context-switches # 25.965 M/sec ( +- 29.89% )
0 cpu-migrations # 0.042 M/sec ( +- 56.54% )
31073 page-faults # 4690.754 M/sec ( +- 0.08% )
26538711696 cycles # 4006230.292 GHz ( +- 0.53% ) (83.31%)
2017496807 stalled-cycles-frontend # 7.60% frontend cycles idle ( +- 0.93% ) (83.32%)
13403650062 stalled-cycles-backend # 50.51% backend cycles idle ( +- 0.33% ) (33.37%)
19770706799 instructions # 0.74 insn per cycle
# 0.68 stalled cycles per insn ( +- 0.04% ) (50.04%)
4419821812 branches # 667207369.714 M/sec ( +- 0.03% ) (66.69%)
121741669 branch-misses # 2.75% of all branches ( +- 0.28% ) (83.34%)
6.6283 +- 0.0358 seconds time elapsed ( +- 0.54% )
```
patch, with reclustering but without filtering (i.e. outputting all the stable *and* unstable clusters) {F8303586}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html' (25 runs):
6475.29 msec task-clock # 0.999 CPUs utilized ( +- 0.31% )
213 context-switches # 32.952 M/sec ( +- 23.81% )
1 cpu-migrations # 0.130 M/sec ( +- 43.84% )
31287 page-faults # 4832.057 M/sec ( +- 0.08% )
25939086577 cycles # 4006160.279 GHz ( +- 0.31% ) (83.31%)
1958812858 stalled-cycles-frontend # 7.55% frontend cycles idle ( +- 0.68% ) (83.32%)
13218961512 stalled-cycles-backend # 50.96% backend cycles idle ( +- 0.29% ) (33.37%)
19752995402 instructions # 0.76 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.04%)
4417079244 branches # 682195472.305 M/sec ( +- 0.03% ) (66.70%)
121510065 branch-misses # 2.75% of all branches ( +- 0.19% ) (83.34%)
6.4832 +- 0.0229 seconds time elapsed ( +- 0.35% )
```
Funnily, *this* measurement shows that said reclustering actually improved performance.
patch, with reclustering, only the stable clusters {F8303594}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html' (25 runs):
6387.71 msec task-clock # 0.999 CPUs utilized ( +- 0.13% )
133 context-switches # 20.792 M/sec ( +- 23.39% )
0 cpu-migrations # 0.063 M/sec ( +- 61.24% )
31318 page-faults # 4903.256 M/sec ( +- 0.08% )
25591984967 cycles # 4006786.266 GHz ( +- 0.13% ) (83.31%)
1881234904 stalled-cycles-frontend # 7.35% frontend cycles idle ( +- 0.25% ) (83.33%)
13209749965 stalled-cycles-backend # 51.62% backend cycles idle ( +- 0.16% ) (33.36%)
19767554347 instructions # 0.77 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.03%)
4417480305 branches # 691618858.046 M/sec ( +- 0.03% ) (66.68%)
118676358 branch-misses # 2.69% of all branches ( +- 0.07% ) (83.33%)
6.3954 +- 0.0118 seconds time elapsed ( +- 0.18% )
```
Performance improved even further?! Makes sense, I guess: fewer clusters to print.
patch, with reclustering, only the unstable clusters {F8303601}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters' (25 runs):
6124.96 msec task-clock # 1.000 CPUs utilized ( +- 0.20% )
194 context-switches # 31.709 M/sec ( +- 20.46% )
0 cpu-migrations # 0.039 M/sec ( +- 49.77% )
31413 page-faults # 5129.261 M/sec ( +- 0.06% )
24536794267 cycles # 4006425.858 GHz ( +- 0.19% ) (83.31%)
1676085087 stalled-cycles-frontend # 6.83% frontend cycles idle ( +- 0.46% ) (83.32%)
13035595603 stalled-cycles-backend # 53.13% backend cycles idle ( +- 0.16% ) (33.36%)
18260877653 instructions # 0.74 insn per cycle
# 0.71 stalled cycles per insn ( +- 0.05% ) (50.03%)
4112411983 branches # 671484364.603 M/sec ( +- 0.03% ) (66.68%)
114066929 branch-misses # 2.77% of all branches ( +- 0.11% ) (83.32%)
6.1278 +- 0.0121 seconds time elapsed ( +- 0.20% )
```
This tells us that the actual `-analysis-inconsistencies-output-file=` outputting only takes ~0.4 sec for 43970 benchmark points (3 whole sweeps)
(Also, wow this is fast, it used to take several minutes originally)
Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40715 | PR40715 ]].
Reviewers: courbet, gchatelet
Reviewed By: courbet
Subscribers: tschuett, jdoerfert, llvm-commits, RKSimon
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58355
llvm-svn: 354441
2019-02-20 17:14:04 +08:00
|
|
|
static cl::opt<bool> AnalysisDisplayUnstableOpcodes(
|
|
|
|
"analysis-display-unstable-clusters",
|
|
|
|
cl::desc("if there is more than one benchmark for an opcode, said "
|
|
|
|
"benchmarks may end up not being clustered into the same cluster "
|
|
|
|
"if the measured performance characteristics are different. by "
|
|
|
|
"default all such opcodes are filtered out. this flag will "
|
|
|
|
"instead show only such unstable opcodes"),
|
|
|
|
cl::init(false));
|
|
|
|
|
2018-10-25 15:44:01 +08:00
|
|
|
static cl::opt<std::string>
|
|
|
|
CpuName("mcpu",
|
|
|
|
cl::desc(
|
|
|
|
"cpu name to use for pfm counters, leave empty to autodetect"),
|
|
|
|
cl::init(""));
|
|
|
|
|
2018-10-23 01:10:47 +08:00
|
|
|
static ExitOnError ExitOnErr;
|
2018-06-07 15:51:16 +08:00
|
|
|
|
2018-06-19 19:28:59 +08:00
|
|
|
#ifdef LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET
|
|
|
|
void LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET();
|
|
|
|
#endif
|
|
|
|
|
2018-10-17 23:04:15 +08:00
|
|
|
// Checks that only one of OpcodeNames, OpcodeIndex or SnippetsFile is provided,
|
|
|
|
// and returns the opcode indices or {} if snippets should be read from
|
2018-09-25 15:31:44 +08:00
|
|
|
// `SnippetsFile`.
|
2018-10-17 23:04:15 +08:00
|
|
|
static std::vector<unsigned>
|
|
|
|
getOpcodesOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
|
|
|
|
const size_t NumSetFlags = (OpcodeNames.empty() ? 0 : 1) +
|
2018-09-25 15:31:44 +08:00
|
|
|
(OpcodeIndex == 0 ? 0 : 1) +
|
|
|
|
(SnippetsFile.empty() ? 0 : 1);
|
|
|
|
if (NumSetFlags != 1)
|
2018-05-17 18:52:18 +08:00
|
|
|
llvm::report_fatal_error(
|
2018-09-25 15:31:44 +08:00
|
|
|
"please provide one and only one of 'opcode-index', 'opcode-name' or "
|
|
|
|
"'snippets-file'");
|
|
|
|
if (!SnippetsFile.empty())
|
2018-10-17 23:04:15 +08:00
|
|
|
return {};
|
2018-05-17 18:52:18 +08:00
|
|
|
if (OpcodeIndex > 0)
|
2018-10-17 23:04:15 +08:00
|
|
|
return {static_cast<unsigned>(OpcodeIndex)};
|
|
|
|
if (OpcodeIndex < 0) {
|
|
|
|
std::vector<unsigned> Result;
|
2018-10-18 16:20:50 +08:00
|
|
|
for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
|
2018-10-17 23:04:15 +08:00
|
|
|
Result.push_back(I);
|
|
|
|
return Result;
|
|
|
|
}
|
2018-05-17 18:52:18 +08:00
|
|
|
// Resolve opcode name -> opcode.
|
2018-10-17 23:04:15 +08:00
|
|
|
const auto ResolveName =
|
|
|
|
[&MCInstrInfo](llvm::StringRef OpcodeName) -> unsigned {
|
|
|
|
for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
|
|
|
|
if (MCInstrInfo.getName(I) == OpcodeName)
|
|
|
|
return I;
|
|
|
|
return 0u;
|
|
|
|
};
|
|
|
|
llvm::SmallVector<llvm::StringRef, 2> Pieces;
|
|
|
|
llvm::StringRef(OpcodeNames.getValue())
|
|
|
|
.split(Pieces, ",", /* MaxSplit */ -1, /* KeepEmpty */ false);
|
|
|
|
std::vector<unsigned> Result;
|
|
|
|
for (const llvm::StringRef OpcodeName : Pieces) {
|
|
|
|
if (unsigned Opcode = ResolveName(OpcodeName))
|
|
|
|
Result.push_back(Opcode);
|
|
|
|
else
|
|
|
|
llvm::report_fatal_error(
|
|
|
|
llvm::Twine("unknown opcode ").concat(OpcodeName));
|
|
|
|
}
|
|
|
|
return Result;
|
2018-05-17 18:52:18 +08:00
|
|
|
}
|
|
|
|
|
2018-09-13 15:40:53 +08:00
|
|
|
// Generates code snippets for opcode `Opcode`.
|
2018-09-25 15:31:44 +08:00
|
|
|
static llvm::Expected<std::vector<BenchmarkCode>>
|
2018-09-13 16:06:29 +08:00
|
|
|
generateSnippets(const LLVMState &State, unsigned Opcode) {
|
2018-10-24 19:55:06 +08:00
|
|
|
const Instruction &Instr = State.getIC().getInstr(Opcode);
|
2018-10-10 22:57:32 +08:00
|
|
|
const llvm::MCInstrDesc &InstrDesc = *Instr.Description;
|
2018-09-13 15:40:53 +08:00
|
|
|
// Ignore instructions that we cannot run.
|
|
|
|
if (InstrDesc.isPseudo())
|
|
|
|
return llvm::make_error<BenchmarkFailure>("Unsupported opcode: isPseudo");
|
|
|
|
if (InstrDesc.isBranch() || InstrDesc.isIndirectBranch())
|
|
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
|
|
"Unsupported opcode: isBranch/isIndirectBranch");
|
|
|
|
if (InstrDesc.isCall() || InstrDesc.isReturn())
|
|
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
|
|
"Unsupported opcode: isCall/isReturn");
|
|
|
|
|
2018-10-10 22:57:32 +08:00
|
|
|
const std::unique_ptr<SnippetGenerator> Generator =
|
|
|
|
State.getExegesisTarget().createSnippetGenerator(BenchmarkMode, State);
|
|
|
|
if (!Generator)
|
|
|
|
llvm::report_fatal_error("cannot create snippet generator");
|
|
|
|
return Generator->generateConfigurations(Instr);
|
2018-09-13 15:40:53 +08:00
|
|
|
}
|
|
|
|
|
2018-09-25 15:31:44 +08:00
|
|
|
namespace {
|
|
|
|
|
|
|
|
// An MCStreamer that reads a BenchmarkCode definition from a file.
|
|
|
|
// The BenchmarkCode definition is just an asm file, with additional comments to
|
|
|
|
// specify which registers should be defined or are live on entry.
|
|
|
|
class BenchmarkCodeStreamer : public llvm::MCStreamer,
|
|
|
|
public llvm::AsmCommentConsumer {
|
|
|
|
public:
|
|
|
|
explicit BenchmarkCodeStreamer(llvm::MCContext *Context,
|
|
|
|
const llvm::MCRegisterInfo *TheRegInfo,
|
|
|
|
BenchmarkCode *Result)
|
|
|
|
: llvm::MCStreamer(*Context), RegInfo(TheRegInfo), Result(Result) {}
|
|
|
|
|
|
|
|
// Implementation of the llvm::MCStreamer interface. We only care about
|
|
|
|
// instructions.
|
2018-09-27 14:10:15 +08:00
|
|
|
void EmitInstruction(const llvm::MCInst &Instruction,
|
2019-02-04 20:51:26 +08:00
|
|
|
const llvm::MCSubtargetInfo &STI) override {
|
2018-09-27 14:10:15 +08:00
|
|
|
Result->Instructions.push_back(Instruction);
|
2018-09-25 15:31:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Implementation of the llvm::AsmCommentConsumer.
|
|
|
|
void HandleComment(llvm::SMLoc Loc, llvm::StringRef CommentText) override {
|
|
|
|
CommentText = CommentText.trim();
|
|
|
|
if (!CommentText.consume_front("LLVM-EXEGESIS-"))
|
|
|
|
return;
|
|
|
|
if (CommentText.consume_front("DEFREG")) {
|
|
|
|
// LLVM-EXEGESIS-DEFREF <reg> <hex_value>
|
2018-11-08 20:09:45 +08:00
|
|
|
RegisterValue RegVal;
|
2018-09-25 15:31:44 +08:00
|
|
|
llvm::SmallVector<llvm::StringRef, 2> Parts;
|
|
|
|
CommentText.split(Parts, ' ', /*unlimited splits*/ -1,
|
|
|
|
/*do not keep empty strings*/ false);
|
|
|
|
if (Parts.size() != 2) {
|
|
|
|
llvm::errs() << "invalid comment 'LLVM-EXEGESIS-DEFREG " << CommentText
|
|
|
|
<< "\n";
|
|
|
|
++InvalidComments;
|
|
|
|
}
|
|
|
|
if (!(RegVal.Register = findRegisterByName(Parts[0].trim()))) {
|
|
|
|
llvm::errs() << "unknown register in 'LLVM-EXEGESIS-DEFREG "
|
|
|
|
<< CommentText << "\n";
|
|
|
|
++InvalidComments;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
const llvm::StringRef HexValue = Parts[1].trim();
|
|
|
|
RegVal.Value = llvm::APInt(
|
|
|
|
/* each hex digit is 4 bits */ HexValue.size() * 4, HexValue, 16);
|
|
|
|
Result->RegisterInitialValues.push_back(std::move(RegVal));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (CommentText.consume_front("LIVEIN")) {
|
|
|
|
// LLVM-EXEGESIS-LIVEIN <reg>
|
|
|
|
if (unsigned Reg = findRegisterByName(CommentText.ltrim()))
|
|
|
|
Result->LiveIns.push_back(Reg);
|
|
|
|
else {
|
|
|
|
llvm::errs() << "unknown register in 'LLVM-EXEGESIS-LIVEIN "
|
|
|
|
<< CommentText << "\n";
|
|
|
|
++InvalidComments;
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned numInvalidComments() const { return InvalidComments; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
// We only care about instructions, we don't implement this part of the API.
|
2018-09-27 14:10:15 +08:00
|
|
|
void EmitCommonSymbol(llvm::MCSymbol *Symbol, uint64_t Size,
|
|
|
|
unsigned ByteAlignment) override {}
|
|
|
|
bool EmitSymbolAttribute(llvm::MCSymbol *Symbol,
|
|
|
|
llvm::MCSymbolAttr Attribute) override {
|
2018-09-25 15:31:44 +08:00
|
|
|
return false;
|
|
|
|
}
|
2018-09-27 14:10:15 +08:00
|
|
|
void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
|
|
|
|
unsigned ValueSize,
|
|
|
|
unsigned MaxBytesToEmit) override {}
|
|
|
|
void EmitZerofill(llvm::MCSection *Section, llvm::MCSymbol *Symbol,
|
|
|
|
uint64_t Size, unsigned ByteAlignment,
|
2018-09-25 15:31:44 +08:00
|
|
|
llvm::SMLoc Loc) override {}
|
|
|
|
|
|
|
|
unsigned findRegisterByName(const llvm::StringRef RegName) const {
|
|
|
|
// FIXME: Can we do better than this ?
|
|
|
|
for (unsigned I = 0, E = RegInfo->getNumRegs(); I < E; ++I) {
|
|
|
|
if (RegName == RegInfo->getName(I))
|
|
|
|
return I;
|
|
|
|
}
|
|
|
|
llvm::errs() << "'" << RegName
|
|
|
|
<< "' is not a valid register name for the target\n";
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
const llvm::MCRegisterInfo *const RegInfo;
|
|
|
|
BenchmarkCode *const Result;
|
|
|
|
unsigned InvalidComments = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Reads code snippets from file `Filename`.
|
|
|
|
static llvm::Expected<std::vector<BenchmarkCode>>
|
|
|
|
readSnippets(const LLVMState &State, llvm::StringRef Filename) {
|
|
|
|
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> BufferPtr =
|
|
|
|
llvm::MemoryBuffer::getFileOrSTDIN(Filename);
|
|
|
|
if (std::error_code EC = BufferPtr.getError()) {
|
|
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
|
|
"cannot read snippet: " + Filename + ": " + EC.message());
|
|
|
|
}
|
|
|
|
llvm::SourceMgr SM;
|
|
|
|
SM.AddNewSourceBuffer(std::move(BufferPtr.get()), llvm::SMLoc());
|
|
|
|
|
|
|
|
BenchmarkCode Result;
|
|
|
|
|
|
|
|
llvm::MCObjectFileInfo ObjectFileInfo;
|
|
|
|
const llvm::TargetMachine &TM = State.getTargetMachine();
|
|
|
|
llvm::MCContext Context(TM.getMCAsmInfo(), TM.getMCRegisterInfo(),
|
|
|
|
&ObjectFileInfo);
|
|
|
|
ObjectFileInfo.InitMCObjectFileInfo(TM.getTargetTriple(), /*PIC*/ false,
|
|
|
|
Context);
|
|
|
|
BenchmarkCodeStreamer Streamer(&Context, TM.getMCRegisterInfo(), &Result);
|
|
|
|
const std::unique_ptr<llvm::MCAsmParser> AsmParser(
|
|
|
|
llvm::createMCAsmParser(SM, Context, Streamer, *TM.getMCAsmInfo()));
|
|
|
|
if (!AsmParser)
|
|
|
|
return llvm::make_error<BenchmarkFailure>("cannot create asm parser");
|
|
|
|
AsmParser->getLexer().setCommentConsumer(&Streamer);
|
|
|
|
|
|
|
|
const std::unique_ptr<llvm::MCTargetAsmParser> TargetAsmParser(
|
|
|
|
TM.getTarget().createMCAsmParser(*TM.getMCSubtargetInfo(), *AsmParser,
|
|
|
|
*TM.getMCInstrInfo(),
|
|
|
|
llvm::MCTargetOptions()));
|
|
|
|
|
|
|
|
if (!TargetAsmParser)
|
|
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
|
|
"cannot create target asm parser");
|
|
|
|
AsmParser->setTargetParser(*TargetAsmParser);
|
|
|
|
|
|
|
|
if (AsmParser->Run(false))
|
|
|
|
return llvm::make_error<BenchmarkFailure>("cannot parse asm file");
|
|
|
|
if (Streamer.numInvalidComments())
|
|
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
|
|
llvm::Twine("found ")
|
|
|
|
.concat(llvm::Twine(Streamer.numInvalidComments()))
|
|
|
|
.concat(" invalid LLVM-EXEGESIS comments"));
|
|
|
|
return std::vector<BenchmarkCode>{std::move(Result)};
|
|
|
|
}
|
|
|
|
|
2018-05-15 20:08:00 +08:00
|
|
|
void benchmarkMain() {
|
|
|
|
if (exegesis::pfm::pfmInitialize())
|
|
|
|
llvm::report_fatal_error("cannot initialize libpfm");
|
|
|
|
|
2018-04-04 19:37:06 +08:00
|
|
|
llvm::InitializeNativeTarget();
|
|
|
|
llvm::InitializeNativeTargetAsmPrinter();
|
2018-09-25 15:31:44 +08:00
|
|
|
llvm::InitializeNativeTargetAsmParser();
|
2018-06-19 19:28:59 +08:00
|
|
|
#ifdef LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET
|
|
|
|
LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET();
|
|
|
|
#endif
|
2018-04-04 19:37:06 +08:00
|
|
|
|
2018-10-25 15:44:01 +08:00
|
|
|
const LLVMState State(CpuName);
|
2018-10-17 23:04:15 +08:00
|
|
|
const auto Opcodes = getOpcodesOrDie(State.getInstrInfo());
|
2018-04-04 19:37:06 +08:00
|
|
|
|
2018-09-25 15:31:44 +08:00
|
|
|
std::vector<BenchmarkCode> Configurations;
|
2018-10-17 23:04:15 +08:00
|
|
|
if (!Opcodes.empty()) {
|
|
|
|
for (const unsigned Opcode : Opcodes) {
|
|
|
|
// Ignore instructions without a sched class if
|
|
|
|
// -ignore-invalid-sched-class is passed.
|
|
|
|
if (IgnoreInvalidSchedClass &&
|
|
|
|
State.getInstrInfo().get(Opcode).getSchedClass() == 0) {
|
|
|
|
llvm::errs() << State.getInstrInfo().getName(Opcode)
|
|
|
|
<< ": ignoring instruction without sched class\n";
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
auto ConfigsForInstr = generateSnippets(State, Opcode);
|
|
|
|
if (!ConfigsForInstr) {
|
|
|
|
llvm::logAllUnhandledErrors(
|
|
|
|
ConfigsForInstr.takeError(), llvm::errs(),
|
|
|
|
llvm::Twine(State.getInstrInfo().getName(Opcode)).concat(": "));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(),
|
|
|
|
std::back_inserter(Configurations));
|
2018-09-25 15:31:44 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
|
|
|
|
}
|
2018-09-13 15:40:53 +08:00
|
|
|
|
2018-06-26 16:49:30 +08:00
|
|
|
const std::unique_ptr<BenchmarkRunner> Runner =
|
|
|
|
State.getExegesisTarget().createBenchmarkRunner(BenchmarkMode, State);
|
|
|
|
if (!Runner) {
|
|
|
|
llvm::report_fatal_error("cannot create benchmark runner");
|
2018-04-04 19:37:06 +08:00
|
|
|
}
|
|
|
|
|
2018-05-17 18:52:18 +08:00
|
|
|
if (NumRepetitions == 0)
|
|
|
|
llvm::report_fatal_error("--num-repetitions must be greater than zero");
|
|
|
|
|
2018-06-07 15:51:16 +08:00
|
|
|
// Write to standard output if file is not set.
|
|
|
|
if (BenchmarkFile.empty())
|
|
|
|
BenchmarkFile = "-";
|
|
|
|
|
2018-09-13 15:40:53 +08:00
|
|
|
for (const BenchmarkCode &Conf : Configurations) {
|
|
|
|
InstructionBenchmark Result =
|
|
|
|
Runner->runConfiguration(Conf, NumRepetitions);
|
2018-09-25 20:18:08 +08:00
|
|
|
ExitOnErr(Result.writeYaml(State, BenchmarkFile));
|
2018-09-13 15:40:53 +08:00
|
|
|
}
|
2018-05-15 20:08:00 +08:00
|
|
|
exegesis::pfm::pfmTerminate();
|
|
|
|
}
|
|
|
|
|
2018-05-17 21:41:28 +08:00
|
|
|
// Prints the results of running analysis pass `Pass` to file `OutputFilename`
|
|
|
|
// if OutputFilename is non-empty.
|
|
|
|
template <typename Pass>
|
|
|
|
static void maybeRunAnalysis(const Analysis &Analyzer, const std::string &Name,
|
2018-06-05 18:56:19 +08:00
|
|
|
const std::string &OutputFilename) {
|
2018-05-17 21:41:28 +08:00
|
|
|
if (OutputFilename.empty())
|
|
|
|
return;
|
|
|
|
if (OutputFilename != "-") {
|
|
|
|
llvm::errs() << "Printing " << Name << " results to file '"
|
|
|
|
<< OutputFilename << "'\n";
|
|
|
|
}
|
|
|
|
std::error_code ErrorCode;
|
|
|
|
llvm::raw_fd_ostream ClustersOS(OutputFilename, ErrorCode,
|
2018-06-08 03:58:58 +08:00
|
|
|
llvm::sys::fs::FA_Read |
|
|
|
|
llvm::sys::fs::FA_Write);
|
|
|
|
if (ErrorCode)
|
|
|
|
llvm::report_fatal_error("cannot open out file: " + OutputFilename);
|
|
|
|
if (auto Err = Analyzer.run<Pass>(ClustersOS))
|
|
|
|
llvm::report_fatal_error(std::move(Err));
|
2018-05-17 21:41:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void analysisMain() {
|
2018-06-07 15:51:16 +08:00
|
|
|
if (BenchmarkFile.empty())
|
|
|
|
llvm::report_fatal_error("--benchmarks-file must be set.");
|
|
|
|
|
2019-02-04 17:12:08 +08:00
|
|
|
if (AnalysisClustersOutputFile.empty() &&
|
|
|
|
AnalysisInconsistenciesOutputFile.empty()) {
|
|
|
|
llvm::report_fatal_error(
|
|
|
|
"At least one of --analysis-clusters-output-file and "
|
|
|
|
"--analysis-inconsistencies-output-file must be specified.");
|
|
|
|
}
|
|
|
|
|
2018-06-05 18:56:19 +08:00
|
|
|
llvm::InitializeNativeTarget();
|
|
|
|
llvm::InitializeNativeTargetAsmPrinter();
|
2018-06-15 15:30:45 +08:00
|
|
|
llvm::InitializeNativeTargetDisassembler();
|
2018-05-15 20:08:00 +08:00
|
|
|
// Read benchmarks.
|
2018-10-25 15:44:01 +08:00
|
|
|
const LLVMState State("");
|
2018-05-15 20:08:00 +08:00
|
|
|
const std::vector<InstructionBenchmark> Points =
|
2018-09-25 20:18:08 +08:00
|
|
|
ExitOnErr(InstructionBenchmark::readYamls(State, BenchmarkFile));
|
2018-05-15 20:08:00 +08:00
|
|
|
llvm::outs() << "Parsed " << Points.size() << " benchmark points\n";
|
|
|
|
if (Points.empty()) {
|
|
|
|
llvm::errs() << "no benchmarks to analyze\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// FIXME: Check that all points have the same triple/cpu.
|
|
|
|
// FIXME: Merge points from several runs (latency and uops).
|
|
|
|
|
|
|
|
std::string Error;
|
|
|
|
const auto *TheTarget =
|
|
|
|
llvm::TargetRegistry::lookupTarget(Points[0].LLVMTriple, Error);
|
|
|
|
if (!TheTarget) {
|
|
|
|
llvm::errs() << "unknown target '" << Points[0].LLVMTriple << "'\n";
|
|
|
|
return;
|
|
|
|
}
|
[llvm-exegesis] Opcode stabilization / reclusterization (PR40715)
Summary:
Given an instruction `Opcode`, we can make benchmarks (measurements) of the
instruction characteristics/performance. Then, to facilitate further analysis
we group the benchmarks with *similar* characteristics into clusters.
Now, this is all not entirely deterministic. Some instructions have variable
characteristics, depending on their arguments. And thus, if we do several
benchmarks of the same instruction `Opcode`, we may end up with *different*
performance characteristics measurements. And when we then do clustering,
these several benchmarks of the same instruction `Opcode` may end up being
clustered into *different* clusters. This is not great for further analysis.
We shall find every `Opcode` with benchmarks not in just one cluster, and move
*all* the benchmarks of said `Opcode` into one new unstable cluster per `Opcode`.
I have solved this by making `ClusterId` a bit field, adding an `IsUnstable` bit,
and introducing the `-analysis-display-unstable-clusters` switch to toggle between
displaying stable-only clusters and unstable-only clusters.
The reclusterization is deterministically stable and produces identical reports
between runs. (Or at least that is what I'm seeing; maybe it isn't.)
Timings/comparisons:
old (current trunk/head) {F8303582}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs):
6624.73 msec task-clock # 0.999 CPUs utilized ( +- 0.53% )
172 context-switches # 25.965 M/sec ( +- 29.89% )
0 cpu-migrations # 0.042 M/sec ( +- 56.54% )
31073 page-faults # 4690.754 M/sec ( +- 0.08% )
26538711696 cycles # 4006230.292 GHz ( +- 0.53% ) (83.31%)
2017496807 stalled-cycles-frontend # 7.60% frontend cycles idle ( +- 0.93% ) (83.32%)
13403650062 stalled-cycles-backend # 50.51% backend cycles idle ( +- 0.33% ) (33.37%)
19770706799 instructions # 0.74 insn per cycle
# 0.68 stalled cycles per insn ( +- 0.04% ) (50.04%)
4419821812 branches # 667207369.714 M/sec ( +- 0.03% ) (66.69%)
121741669 branch-misses # 2.75% of all branches ( +- 0.28% ) (83.34%)
6.6283 +- 0.0358 seconds time elapsed ( +- 0.54% )
```
patch, with reclustering but without filtering (i.e. outputting all the stable *and* unstable clusters) {F8303586}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html' (25 runs):
6475.29 msec task-clock # 0.999 CPUs utilized ( +- 0.31% )
213 context-switches # 32.952 M/sec ( +- 23.81% )
1 cpu-migrations # 0.130 M/sec ( +- 43.84% )
31287 page-faults # 4832.057 M/sec ( +- 0.08% )
25939086577 cycles # 4006160.279 GHz ( +- 0.31% ) (83.31%)
1958812858 stalled-cycles-frontend # 7.55% frontend cycles idle ( +- 0.68% ) (83.32%)
13218961512 stalled-cycles-backend # 50.96% backend cycles idle ( +- 0.29% ) (33.37%)
19752995402 instructions # 0.76 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.04%)
4417079244 branches # 682195472.305 M/sec ( +- 0.03% ) (66.70%)
121510065 branch-misses # 2.75% of all branches ( +- 0.19% ) (83.34%)
6.4832 +- 0.0229 seconds time elapsed ( +- 0.35% )
```
Funnily, *this* measurement shows that said reclustering actually improved performance.
patch, with reclustering, only the stable clusters {F8303594}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html' (25 runs):
6387.71 msec task-clock # 0.999 CPUs utilized ( +- 0.13% )
133 context-switches # 20.792 M/sec ( +- 23.39% )
0 cpu-migrations # 0.063 M/sec ( +- 61.24% )
31318 page-faults # 4903.256 M/sec ( +- 0.08% )
25591984967 cycles # 4006786.266 GHz ( +- 0.13% ) (83.31%)
1881234904 stalled-cycles-frontend # 7.35% frontend cycles idle ( +- 0.25% ) (83.33%)
13209749965 stalled-cycles-backend # 51.62% backend cycles idle ( +- 0.16% ) (33.36%)
19767554347 instructions # 0.77 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.03%)
4417480305 branches # 691618858.046 M/sec ( +- 0.03% ) (66.68%)
118676358 branch-misses # 2.69% of all branches ( +- 0.07% ) (83.33%)
6.3954 +- 0.0118 seconds time elapsed ( +- 0.18% )
```
Performance improved even further?! Makes sense, I guess: fewer clusters to print.
patch, with reclustering, only the unstable clusters {F8303601}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters' (25 runs):
6124.96 msec task-clock # 1.000 CPUs utilized ( +- 0.20% )
194 context-switches # 31.709 M/sec ( +- 20.46% )
0 cpu-migrations # 0.039 M/sec ( +- 49.77% )
31413 page-faults # 5129.261 M/sec ( +- 0.06% )
24536794267 cycles # 4006425.858 GHz ( +- 0.19% ) (83.31%)
1676085087 stalled-cycles-frontend # 6.83% frontend cycles idle ( +- 0.46% ) (83.32%)
13035595603 stalled-cycles-backend # 53.13% backend cycles idle ( +- 0.16% ) (33.36%)
18260877653 instructions # 0.74 insn per cycle
# 0.71 stalled cycles per insn ( +- 0.05% ) (50.03%)
4112411983 branches # 671484364.603 M/sec ( +- 0.03% ) (66.68%)
114066929 branch-misses # 2.77% of all branches ( +- 0.11% ) (83.32%)
6.1278 +- 0.0121 seconds time elapsed ( +- 0.20% )
```
This tells us that the actual `-analysis-inconsistencies-output-file=` outputting only takes ~0.4 sec for 43970 benchmark points (3 whole sweeps)
(Also, wow this is fast, it used to take several minutes originally)
Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40715 | PR40715 ]].
Reviewers: courbet, gchatelet
Reviewed By: courbet
Subscribers: tschuett, jdoerfert, llvm-commits, RKSimon
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58355
llvm-svn: 354441
2019-02-20 17:14:04 +08:00
|
|
|
|
|
|
|
std::unique_ptr<llvm::MCInstrInfo> InstrInfo(TheTarget->createMCInstrInfo());
|
|
|
|
|
2018-06-11 17:18:01 +08:00
|
|
|
const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create(
|
[llvm-exegesis] Split Epsilon param into two (PR40787)
Summary:
This eps param is used for two distinct things:
* initial point clusterization
* checking clusters against the llvm values
What if one wants to only look at highly different clusters, without changing
the clustering itself? In particular, this helps to weed out noisy measurements
(since the clusterization epsilon is still small, so there is a better chance
that noisy measurements from the same opcode will go into different clusters)
By splitting it into two params it is now possible.
This is nearly-free performance-wise:
Old:
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 10099 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs):
390.01 msec task-clock # 0.998 CPUs utilized ( +- 0.25% )
12 context-switches # 31.735 M/sec ( +- 27.38% )
0 cpu-migrations # 0.000 K/sec
4745 page-faults # 12183.732 M/sec ( +- 0.54% )
1562711900 cycles # 4012303.327 GHz ( +- 0.24% ) (82.90%)
185567822 stalled-cycles-frontend # 11.87% frontend cycles idle ( +- 0.52% ) (83.30%)
392106234 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.31% ) (33.79%)
1839236666 instructions # 1.18 insn per cycle
# 0.21 stalled cycles per insn ( +- 0.15% ) (50.37%)
407035764 branches # 1045074878.710 M/sec ( +- 0.12% ) (66.80%)
10896459 branch-misses # 2.68% of all branches ( +- 0.17% ) (83.20%)
0.390629 +- 0.000972 seconds time elapsed ( +- 0.25% )
```
```
$ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 50572 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (9 runs):
6803.36 msec task-clock # 0.999 CPUs utilized ( +- 0.96% )
262 context-switches # 38.546 M/sec ( +- 23.06% )
0 cpu-migrations # 0.065 M/sec ( +- 76.03% )
13287 page-faults # 1953.206 M/sec ( +- 0.32% )
27252537904 cycles # 4006024.257 GHz ( +- 0.95% ) (83.31%)
1496314935 stalled-cycles-frontend # 5.49% frontend cycles idle ( +- 0.97% ) (83.32%)
16128404524 stalled-cycles-backend # 59.18% backend cycles idle ( +- 0.30% ) (33.37%)
17611143370 instructions # 0.65 insn per cycle
# 0.92 stalled cycles per insn ( +- 0.05% ) (50.04%)
3894906599 branches # 572537147.437 M/sec ( +- 0.03% ) (66.69%)
116314514 branch-misses # 2.99% of all branches ( +- 0.20% ) (83.35%)
6.8118 +- 0.0689 seconds time elapsed ( +- 1.01%)
```
New:
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 10099 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (25 runs):
400.14 msec task-clock # 0.998 CPUs utilized ( +- 0.66% )
12 context-switches # 29.429 M/sec ( +- 25.95% )
0 cpu-migrations # 0.100 M/sec ( +-100.00% )
4714 page-faults # 11796.496 M/sec ( +- 0.55% )
1603131306 cycles # 4011840.105 GHz ( +- 0.66% ) (82.85%)
199538509 stalled-cycles-frontend # 12.45% frontend cycles idle ( +- 2.40% ) (83.10%)
402249109 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.19% ) (34.05%)
1847783963 instructions # 1.15 insn per cycle
# 0.22 stalled cycles per insn ( +- 0.18% ) (50.64%)
407162722 branches # 1018925730.631 M/sec ( +- 0.12% ) (67.02%)
10932779 branch-misses # 2.69% of all branches ( +- 0.51% ) (83.28%)
0.40077 +- 0.00267 seconds time elapsed ( +- 0.67% )
lebedevri@pini-pini:/build/llvm-build-Clang-release$ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 50572 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (9 runs):
6947.79 msec task-clock # 1.000 CPUs utilized ( +- 0.90% )
217 context-switches # 31.236 M/sec ( +- 36.16% )
1 cpu-migrations # 0.096 M/sec ( +- 50.00% )
13258 page-faults # 1908.389 M/sec ( +- 0.34% )
27830796523 cycles # 4006032.286 GHz ( +- 0.89% ) (83.30%)
1504554006 stalled-cycles-frontend # 5.41% frontend cycles idle ( +- 2.10% ) (83.32%)
16716574843 stalled-cycles-backend # 60.07% backend cycles idle ( +- 0.65% ) (33.38%)
17755545931 instructions # 0.64 insn per cycle
# 0.94 stalled cycles per insn ( +- 0.09% ) (50.04%)
3897255686 branches # 560980426.597 M/sec ( +- 0.06% ) (66.70%)
117045395 branch-misses # 3.00% of all branches ( +- 0.47% ) (83.34%)
6.9507 +- 0.0627 seconds time elapsed ( +- 0.90% )
```
I.e. it's a +2.6% slowdown for one whole sweep, or +2% for 5 whole sweeps.
Within noise, I'd say.
Should help with [[ https://bugs.llvm.org/show_bug.cgi?id=40787 | PR40787 ]].
Reviewers: courbet, gchatelet
Reviewed By: courbet
Subscribers: tschuett, RKSimon, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58476
llvm-svn: 354767
2019-02-25 17:36:12 +08:00
|
|
|
Points, AnalysisNumPoints, AnalysisClusteringEpsilon,
|
|
|
|
InstrInfo->getNumOpcodes()));
|
2018-05-16 16:47:21 +08:00
|
|
|
|
[llvm-exegesis] Opcode stabilization / reclusterization (PR40715)
Summary:
Given an instruction `Opcode`, we can make benchmarks (measurements) of the
instruction characteristics/performance. Then, to facilitate further analysis
we group the benchmarks with *similar* characteristics into clusters.
Now, this is all not entirely deterministic. Some instructions have variable
characteristics, depending on their arguments. And thus, if we do several
benchmarks of the same instruction `Opcode`, we may end up with *different*
performance characteristics measurements. And when we then do clustering,
these several benchmarks of the same instruction `Opcode` may end up being
clustered into *different* clusters. This is not great for further analysis.
We shall find every `Opcode` with benchmarks not in just one cluster, and move
*all* the benchmarks of said `Opcode` into one new unstable cluster per `Opcode`.
I have solved this by making `ClusterId` a bit field, adding an `IsUnstable` bit,
and introducing the `-analysis-display-unstable-clusters` switch to toggle between
displaying stable-only clusters and unstable-only clusters.
The reclusterization is deterministically stable and produces identical reports
between runs. (Or at least that is what I'm seeing; maybe it isn't.)
Timings/comparisons:
old (current trunk/head) {F8303582}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs):
6624.73 msec task-clock # 0.999 CPUs utilized ( +- 0.53% )
172 context-switches # 25.965 M/sec ( +- 29.89% )
0 cpu-migrations # 0.042 M/sec ( +- 56.54% )
31073 page-faults # 4690.754 M/sec ( +- 0.08% )
26538711696 cycles # 4006230.292 GHz ( +- 0.53% ) (83.31%)
2017496807 stalled-cycles-frontend # 7.60% frontend cycles idle ( +- 0.93% ) (83.32%)
13403650062 stalled-cycles-backend # 50.51% backend cycles idle ( +- 0.33% ) (33.37%)
19770706799 instructions # 0.74 insn per cycle
# 0.68 stalled cycles per insn ( +- 0.04% ) (50.04%)
4419821812 branches # 667207369.714 M/sec ( +- 0.03% ) (66.69%)
121741669 branch-misses # 2.75% of all branches ( +- 0.28% ) (83.34%)
6.6283 +- 0.0358 seconds time elapsed ( +- 0.54% )
```
patch, with reclustering but without filtering (i.e. outputting all the stable *and* unstable clusters) {F8303586}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html' (25 runs):
6475.29 msec task-clock # 0.999 CPUs utilized ( +- 0.31% )
213 context-switches # 32.952 M/sec ( +- 23.81% )
1 cpu-migrations # 0.130 M/sec ( +- 43.84% )
31287 page-faults # 4832.057 M/sec ( +- 0.08% )
25939086577 cycles # 4006160.279 GHz ( +- 0.31% ) (83.31%)
1958812858 stalled-cycles-frontend # 7.55% frontend cycles idle ( +- 0.68% ) (83.32%)
13218961512 stalled-cycles-backend # 50.96% backend cycles idle ( +- 0.29% ) (33.37%)
19752995402 instructions # 0.76 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.04%)
4417079244 branches # 682195472.305 M/sec ( +- 0.03% ) (66.70%)
121510065 branch-misses # 2.75% of all branches ( +- 0.19% ) (83.34%)
6.4832 +- 0.0229 seconds time elapsed ( +- 0.35% )
```
Funnily, *this* measurement shows that said reclustering actually improved performance.
patch, with reclustering, only the stable clusters {F8303594}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html' (25 runs):
6387.71 msec task-clock # 0.999 CPUs utilized ( +- 0.13% )
133 context-switches # 20.792 M/sec ( +- 23.39% )
0 cpu-migrations # 0.063 M/sec ( +- 61.24% )
31318 page-faults # 4903.256 M/sec ( +- 0.08% )
25591984967 cycles # 4006786.266 GHz ( +- 0.13% ) (83.31%)
1881234904 stalled-cycles-frontend # 7.35% frontend cycles idle ( +- 0.25% ) (83.33%)
13209749965 stalled-cycles-backend # 51.62% backend cycles idle ( +- 0.16% ) (33.36%)
19767554347 instructions # 0.77 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.03%)
4417480305 branches # 691618858.046 M/sec ( +- 0.03% ) (66.68%)
118676358 branch-misses # 2.69% of all branches ( +- 0.07% ) (83.33%)
6.3954 +- 0.0118 seconds time elapsed ( +- 0.18% )
```
Performance improved even further?! Makes sense, I guess: fewer clusters to print.
patch, with reclustering, only the unstable clusters {F8303601}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters' (25 runs):
6124.96 msec task-clock # 1.000 CPUs utilized ( +- 0.20% )
194 context-switches # 31.709 M/sec ( +- 20.46% )
0 cpu-migrations # 0.039 M/sec ( +- 49.77% )
31413 page-faults # 5129.261 M/sec ( +- 0.06% )
24536794267 cycles # 4006425.858 GHz ( +- 0.19% ) (83.31%)
1676085087 stalled-cycles-frontend # 6.83% frontend cycles idle ( +- 0.46% ) (83.32%)
13035595603 stalled-cycles-backend # 53.13% backend cycles idle ( +- 0.16% ) (33.36%)
18260877653 instructions # 0.74 insn per cycle
# 0.71 stalled cycles per insn ( +- 0.05% ) (50.03%)
4112411983 branches # 671484364.603 M/sec ( +- 0.03% ) (66.68%)
114066929 branch-misses # 2.77% of all branches ( +- 0.11% ) (83.32%)
6.1278 +- 0.0121 seconds time elapsed ( +- 0.20% )
```
This tells us that the actual `-analysis-inconsistencies-output-file=` outputting only takes ~0.4 sec for 43970 benchmark points (3 whole sweeps)
(Also, wow this is fast, it used to take several minutes originally)
Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40715 | PR40715 ]].
Reviewers: courbet, gchatelet
Reviewed By: courbet
Subscribers: tschuett, jdoerfert, llvm-commits, RKSimon
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58355
llvm-svn: 354441
2019-02-20 17:14:04 +08:00
|
|
|
const Analysis Analyzer(*TheTarget, std::move(InstrInfo), Clustering,
|
[llvm-exegesis] Split Epsilon param into two (PR40787)
Summary:
This eps param is used for two distinct things:
* initial point clusterization
* checking clusters against the llvm values
What if one wants to only look at highly different clusters, without changing
the clustering itself? In particular, this helps to weed out noisy measurements
(since the clusterization epsilon is still small, so there is a better chance
that noisy measurements from the same opcode will go into different clusters)
By splitting it into two params it is now possible.
This is nearly-free performance-wise:
Old:
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 10099 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs):
390.01 msec task-clock # 0.998 CPUs utilized ( +- 0.25% )
12 context-switches # 31.735 M/sec ( +- 27.38% )
0 cpu-migrations # 0.000 K/sec
4745 page-faults # 12183.732 M/sec ( +- 0.54% )
1562711900 cycles # 4012303.327 GHz ( +- 0.24% ) (82.90%)
185567822 stalled-cycles-frontend # 11.87% frontend cycles idle ( +- 0.52% ) (83.30%)
392106234 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.31% ) (33.79%)
1839236666 instructions # 1.18 insn per cycle
# 0.21 stalled cycles per insn ( +- 0.15% ) (50.37%)
407035764 branches # 1045074878.710 M/sec ( +- 0.12% ) (66.80%)
10896459 branch-misses # 2.68% of all branches ( +- 0.17% ) (83.20%)
0.390629 +- 0.000972 seconds time elapsed ( +- 0.25% )
```
```
$ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 50572 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (9 runs):
6803.36 msec task-clock # 0.999 CPUs utilized ( +- 0.96% )
262 context-switches # 38.546 M/sec ( +- 23.06% )
0 cpu-migrations # 0.065 M/sec ( +- 76.03% )
13287 page-faults # 1953.206 M/sec ( +- 0.32% )
27252537904 cycles # 4006024.257 GHz ( +- 0.95% ) (83.31%)
1496314935 stalled-cycles-frontend # 5.49% frontend cycles idle ( +- 0.97% ) (83.32%)
16128404524 stalled-cycles-backend # 59.18% backend cycles idle ( +- 0.30% ) (33.37%)
17611143370 instructions # 0.65 insn per cycle
# 0.92 stalled cycles per insn ( +- 0.05% ) (50.04%)
3894906599 branches # 572537147.437 M/sec ( +- 0.03% ) (66.69%)
116314514 branch-misses # 2.99% of all branches ( +- 0.20% ) (83.35%)
6.8118 +- 0.0689 seconds time elapsed ( +- 1.01%)
```
New:
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 10099 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency-1.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (25 runs):
400.14 msec task-clock # 0.998 CPUs utilized ( +- 0.66% )
12 context-switches # 29.429 M/sec ( +- 25.95% )
0 cpu-migrations # 0.100 M/sec ( +-100.00% )
4714 page-faults # 11796.496 M/sec ( +- 0.55% )
1603131306 cycles # 4011840.105 GHz ( +- 0.66% ) (82.85%)
199538509 stalled-cycles-frontend # 12.45% frontend cycles idle ( +- 2.40% ) (83.10%)
402249109 stalled-cycles-backend # 25.09% backend cycles idle ( +- 1.19% ) (34.05%)
1847783963 instructions # 1.15 insn per cycle
# 0.22 stalled cycles per insn ( +- 0.18% ) (50.64%)
407162722 branches # 1018925730.631 M/sec ( +- 0.12% ) (67.02%)
10932779 branch-misses # 2.69% of all branches ( +- 0.51% ) (83.28%)
0.40077 +- 0.00267 seconds time elapsed ( +- 0.67% )
```
```
$ perf stat -r 9 ./bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 50572 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new.html'
...
Performance counter stats for './bin/llvm-exegesis -mode=analysis -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-latency.yml -analysis-inconsistencies-output-file=/tmp/clusters-new.html' (9 runs):
6947.79 msec task-clock # 1.000 CPUs utilized ( +- 0.90% )
217 context-switches # 31.236 M/sec ( +- 36.16% )
1 cpu-migrations # 0.096 M/sec ( +- 50.00% )
13258 page-faults # 1908.389 M/sec ( +- 0.34% )
27830796523 cycles # 4006032.286 GHz ( +- 0.89% ) (83.30%)
1504554006 stalled-cycles-frontend # 5.41% frontend cycles idle ( +- 2.10% ) (83.32%)
16716574843 stalled-cycles-backend # 60.07% backend cycles idle ( +- 0.65% ) (33.38%)
17755545931 instructions # 0.64 insn per cycle
# 0.94 stalled cycles per insn ( +- 0.09% ) (50.04%)
3897255686 branches # 560980426.597 M/sec ( +- 0.06% ) (66.70%)
117045395 branch-misses # 3.00% of all branches ( +- 0.47% ) (83.34%)
6.9507 +- 0.0627 seconds time elapsed ( +- 0.90% )
```
I.e. it's +2.6% slowdown for one whole sweep, or +2% for 5 whole sweeps.
Within noise, I'd say.
Should help with [[ https://bugs.llvm.org/show_bug.cgi?id=40787 | PR40787 ]].
Reviewers: courbet, gchatelet
Reviewed By: courbet
Subscribers: tschuett, RKSimon, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58476
llvm-svn: 354767
2019-02-25 17:36:12 +08:00
|
|
|
AnalysisInconsistencyEpsilon,
|
[llvm-exegesis] Opcode stabilization / reclusterization (PR40715)
Summary:
Given an instruction `Opcode`, we can make benchmarks (measurements) of the
instruction characteristics/performance. Then, to facilitate further analysis
we group the benchmarks with *similar* characteristics into clusters.
Now, this is all not entirely deterministic. Some instructions have variable
characteristics, depending on their arguments. And thus, if we do several
benchmarks of the same instruction `Opcode`, we may end up with *different*
performance characteristics measurements. And when we then do clustering,
these several benchmarks of the same instruction `Opcode` may end up being
clustered into *different* clusters. This is not great for further analysis.
We shall find every `Opcode` with benchmarks not in just one cluster, and move
*all* the benchmarks of said `Opcode` into one new unstable cluster per `Opcode`.
I have solved this by making `ClusterId` a bit field, adding an `IsUnstable` bit,
and introducing `-analysis-display-unstable-clusters` switch to toggle between
displaying stable-only clusters and unstable-only clusters.
The reclusterization is deterministically stable, produces identical reports
between runs. (Or at least that is what I'm seeing; maybe it isn't.)
Timings/comparisons:
old (current trunk/head) {F8303582}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-old.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-old.html' (25 runs):
6624.73 msec task-clock # 0.999 CPUs utilized ( +- 0.53% )
172 context-switches # 25.965 M/sec ( +- 29.89% )
0 cpu-migrations # 0.042 M/sec ( +- 56.54% )
31073 page-faults # 4690.754 M/sec ( +- 0.08% )
26538711696 cycles # 4006230.292 GHz ( +- 0.53% ) (83.31%)
2017496807 stalled-cycles-frontend # 7.60% frontend cycles idle ( +- 0.93% ) (83.32%)
13403650062 stalled-cycles-backend # 50.51% backend cycles idle ( +- 0.33% ) (33.37%)
19770706799 instructions # 0.74 insn per cycle
# 0.68 stalled cycles per insn ( +- 0.04% ) (50.04%)
4419821812 branches # 667207369.714 M/sec ( +- 0.03% ) (66.69%)
121741669 branch-misses # 2.75% of all branches ( +- 0.28% ) (83.34%)
6.6283 +- 0.0358 seconds time elapsed ( +- 0.54% )
```
patch, with reclustering but without filtering (i.e. outputting all the stable *and* unstable clusters) {F8303586}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-all.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-all.html' (25 runs):
6475.29 msec task-clock # 0.999 CPUs utilized ( +- 0.31% )
213 context-switches # 32.952 M/sec ( +- 23.81% )
1 cpu-migrations # 0.130 M/sec ( +- 43.84% )
31287 page-faults # 4832.057 M/sec ( +- 0.08% )
25939086577 cycles # 4006160.279 GHz ( +- 0.31% ) (83.31%)
1958812858 stalled-cycles-frontend # 7.55% frontend cycles idle ( +- 0.68% ) (83.32%)
13218961512 stalled-cycles-backend # 50.96% backend cycles idle ( +- 0.29% ) (33.37%)
19752995402 instructions # 0.76 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.04%)
4417079244 branches # 682195472.305 M/sec ( +- 0.03% ) (66.70%)
121510065 branch-misses # 2.75% of all branches ( +- 0.19% ) (83.34%)
6.4832 +- 0.0229 seconds time elapsed ( +- 0.35% )
```
Funnily, *this* measurement shows that said reclustering actually improved performance.
patch, with reclustering, only the stable clusters {F8303594}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-stable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-stable.html' (25 runs):
6387.71 msec task-clock # 0.999 CPUs utilized ( +- 0.13% )
133 context-switches # 20.792 M/sec ( +- 23.39% )
0 cpu-migrations # 0.063 M/sec ( +- 61.24% )
31318 page-faults # 4903.256 M/sec ( +- 0.08% )
25591984967 cycles # 4006786.266 GHz ( +- 0.13% ) (83.31%)
1881234904 stalled-cycles-frontend # 7.35% frontend cycles idle ( +- 0.25% ) (83.33%)
13209749965 stalled-cycles-backend # 51.62% backend cycles idle ( +- 0.16% ) (33.36%)
19767554347 instructions # 0.77 insn per cycle
# 0.67 stalled cycles per insn ( +- 0.04% ) (50.03%)
4417480305 branches # 691618858.046 M/sec ( +- 0.03% ) (66.68%)
118676358 branch-misses # 2.69% of all branches ( +- 0.07% ) (83.33%)
6.3954 +- 0.0118 seconds time elapsed ( +- 0.18% )
```
Performance improved even further?! Makes sense, I guess: fewer clusters to print.
patch, with reclustering, only the unstable clusters {F8303601}
```
$ perf stat -r 25 ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
...
no exegesis target for x86_64-unknown-linux-gnu, using default
Parsed 43970 benchmark points
Printing sched class consistency analysis results to file '/tmp/clusters-new-unstable.html'
Performance counter stats for './bin/llvm-exegesis -mode=analysis -analysis-epsilon=0.5 -benchmarks-file=/home/lebedevri/PileDriver-Sched/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters-new-unstable.html -analysis-display-unstable-clusters' (25 runs):
6124.96 msec task-clock # 1.000 CPUs utilized ( +- 0.20% )
194 context-switches # 31.709 M/sec ( +- 20.46% )
0 cpu-migrations # 0.039 M/sec ( +- 49.77% )
31413 page-faults # 5129.261 M/sec ( +- 0.06% )
24536794267 cycles # 4006425.858 GHz ( +- 0.19% ) (83.31%)
1676085087 stalled-cycles-frontend # 6.83% frontend cycles idle ( +- 0.46% ) (83.32%)
13035595603 stalled-cycles-backend # 53.13% backend cycles idle ( +- 0.16% ) (33.36%)
18260877653 instructions # 0.74 insn per cycle
# 0.71 stalled cycles per insn ( +- 0.05% ) (50.03%)
4112411983 branches # 671484364.603 M/sec ( +- 0.03% ) (66.68%)
114066929 branch-misses # 2.77% of all branches ( +- 0.11% ) (83.32%)
6.1278 +- 0.0121 seconds time elapsed ( +- 0.20% )
```
This tells us that the actual `-analysis-inconsistencies-output-file=` outputting only takes ~0.4 sec for 43970 benchmark points (3 whole sweeps)
(Also, wow this is fast, it used to take several minutes originally)
Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40715 | PR40715 ]].
Reviewers: courbet, gchatelet
Reviewed By: courbet
Subscribers: tschuett, jdoerfert, llvm-commits, RKSimon
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D58355
llvm-svn: 354441
2019-02-20 17:14:04 +08:00
|
|
|
AnalysisDisplayUnstableOpcodes);
|
2018-05-16 16:47:21 +08:00
|
|
|
|
2018-05-17 21:41:28 +08:00
|
|
|
maybeRunAnalysis<Analysis::PrintClusters>(Analyzer, "analysis clusters",
|
|
|
|
AnalysisClustersOutputFile);
|
|
|
|
maybeRunAnalysis<Analysis::PrintSchedClassInconsistencies>(
|
|
|
|
Analyzer, "sched class consistency analysis",
|
|
|
|
AnalysisInconsistenciesOutputFile);
|
2018-04-04 19:37:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace exegesis
|
2018-10-23 01:10:47 +08:00
|
|
|
} // namespace llvm
|
2018-04-04 19:37:06 +08:00
|
|
|
|
|
|
|
// Program entry point: parses the command line, configures how errors reported
// through exegesis::ExitOnErr translate into process exit codes, then runs
// either the analysis or the benchmark driver depending on the selected mode.
int main(int Argc, char **Argv) {
|
2018-10-23 01:10:47 +08:00
|
|
|
using namespace llvm;
|
|
|
|
// The empty string is the tool overview argument passed to the option parser.
cl::ParseCommandLineOptions(Argc, Argv, "");
|
2018-04-04 19:37:06 +08:00
|
|
|
|
2018-06-11 17:18:01 +08:00
|
|
|
// Exit-code policy: a StringError maps to EXIT_SUCCESS, any other error kind
// maps to EXIT_FAILURE.
// NOTE(review): presumably StringError is used for expected/unsupported
// conditions that should not fail the process -- confirm against callers.
exegesis::ExitOnErr.setExitCodeMapper([](const llvm::Error &Err) {
|
|
|
|
if (Err.isA<llvm::StringError>())
|
|
|
|
return EXIT_SUCCESS;
|
|
|
|
return EXIT_FAILURE;
|
|
|
|
});
|
|
|
|
|
2018-10-23 01:10:47 +08:00
|
|
|
// When BenchmarkMode is Unknown the tool runs the analysis driver; any other
// mode value runs the benchmark driver.
if (exegesis::BenchmarkMode == exegesis::InstructionBenchmark::Unknown) {
|
2018-05-15 20:08:00 +08:00
|
|
|
exegesis::analysisMain();
|
|
|
|
} else {
|
|
|
|
exegesis::benchmarkMain();
|
2018-04-04 19:37:06 +08:00
|
|
|
|
}
|
|
|
|
// Reached only if the selected driver returned normally.
return EXIT_SUCCESS;
|
|
|
|
}
|