//===- bolt/Rewrite/BinaryPassManager.cpp - Binary-level pass manager -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "bolt/Rewrite/BinaryPassManager.h"
#include "bolt/Passes/ADRRelaxationPass.h"
#include "bolt/Passes/Aligner.h"
#include "bolt/Passes/AllocCombiner.h"
#include "bolt/Passes/AsmDump.h"
#include "bolt/Passes/FrameOptimizer.h"
#include "bolt/Passes/IdenticalCodeFolding.h"
#include "bolt/Passes/IndirectCallPromotion.h"
#include "bolt/Passes/Inliner.h"
#include "bolt/Passes/Instrumentation.h"
#include "bolt/Passes/JTFootprintReduction.h"
#include "bolt/Passes/LongJmp.h"
#include "bolt/Passes/LoopInversionPass.h"
#include "bolt/Passes/PLTCall.h"
#include "bolt/Passes/PatchEntries.h"
#include "bolt/Passes/RegReAssign.h"
#include "bolt/Passes/ReorderData.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "bolt/Passes/RetpolineInsertion.h"
#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Passes/StokeInfo.h"
#include "bolt/Passes/TailDuplication.h"
#include "bolt/Passes/ThreeWayBranch.h"
#include "bolt/Passes/ValidateInternalCalls.h"
#include "bolt/Passes/VeneerElimination.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <numeric>

using namespace llvm;

namespace opts {

extern cl::opt<bool> PrintAll;
extern cl::opt<bool> PrintDynoStats;
extern cl::opt<bool> DumpDotAll;
extern cl::opt<std::string> AsmDump;
extern cl::opt<bolt::PLTCall::OptType> PLT;

static cl::opt<bool>
DynoStatsAll("dyno-stats-all",
  cl::desc("print dyno stats after each stage"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltCategory));

static cl::opt<bool>
EliminateUnreachable("eliminate-unreachable",
  cl::desc("eliminate unreachable code"),
  cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory));

cl::opt<bool>
ICF("icf",
  cl::desc("fold functions with identical code"),
  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
JTFootprintReductionFlag("jt-footprint-reduction",
  cl::desc("make jump tables size smaller at the cost of using more "
           "instructions at jump sites"),
  cl::ZeroOrMore, cl::cat(BoltOptCategory));

cl::opt<bool>
NeverPrint("never-print",
  cl::desc("never print"),
  cl::init(false), cl::ZeroOrMore, cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool>
PrintAfterBranchFixup("print-after-branch-fixup",
  cl::desc("print function after fixing local branches"),
  cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintAfterLowering("print-after-lowering",
  cl::desc("print function after instruction lowering"),
  cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<bool>
PrintFinalized("print-finalized",
  cl::desc("print function after CFG is finalized"),
  cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintFOP("print-fop",
  cl::desc("print functions after frame optimizer pass"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintICF("print-icf",
  cl::desc("print functions after ICF optimization"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintICP("print-icp",
  cl::desc("print functions after indirect call promotion"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintInline("print-inline",
  cl::desc("print functions after inlining optimization"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintJTFootprintReduction("print-after-jt-footprint-reduction",
  cl::desc("print function after jt-footprint-reduction pass"),
  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintLongJmp("print-longjmp",
  cl::desc("print functions after longjmp pass"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<bool>
PrintNormalized("print-normalized",
  cl::desc("print functions after CFG is normalized"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltCategory));

static cl::opt<bool>
PrintOptimizeBodyless("print-optimize-bodyless",
  cl::desc("print functions after bodyless optimization"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintPeepholes("print-peepholes",
  cl::desc("print functions after peephole optimization"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintPLT("print-plt",
  cl::desc("print functions after PLT optimization"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintProfileStats("print-profile-stats",
  cl::desc("print profile quality/bias analysis"),
  cl::ZeroOrMore, cl::init(false), cl::cat(BoltCategory));

static cl::opt<bool>
PrintRegReAssign("print-regreassign",
  cl::desc("print functions after regreassign pass"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<bool>
PrintReordered("print-reordered",
  cl::desc("print functions after layout optimization"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintReorderedFunctions("print-reordered-functions",
  cl::desc("print functions after clustering"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintRetpolineInsertion("print-retpoline-insertion",
  cl::desc("print functions after retpoline insertion pass"),
  cl::init(false), cl::ZeroOrMore, cl::cat(BoltCategory));

static cl::opt<bool>
PrintSCTC("print-sctc",
  cl::desc("print functions after conditional tail call simplification"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintSimplifyROLoads("print-simplify-rodata-loads",
  cl::desc("print functions after simplification of RO data loads"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintSplit("print-split",
  cl::desc("print functions after code splitting"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintStoke("print-stoke",
  cl::desc("print functions after stoke analysis"),
  cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintVeneerElimination("print-veneer-elimination",
  cl::desc("print functions after veneer elimination pass"),
  cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintUCE("print-uce",
  cl::desc("print functions after unreachable code elimination"),
  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
RegReAssign("reg-reassign",
  cl::desc("reassign registers so as to avoid using REX prefixes in hot code"),
  cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
SimplifyConditionalTailCalls("simplify-conditional-tail-calls",
  cl::desc("simplify conditional tail calls by removing unnecessary jumps"),
  cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
SimplifyRODataLoads("simplify-rodata-loads",
  cl::desc("simplify loads from read-only sections by replacing the memory "
           "operand with the constant found in the corresponding section"),
  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::list<std::string>
SpecializeMemcpy1("memcpy1-spec",
  cl::desc("list of functions with call sites for which to specialize memcpy() "
           "for size 1"),
  cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
Stoke("stoke",
  cl::desc("turn on the stoke analysis"),
  cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
StringOps("inline-memcpy",
  cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
  cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
StripRepRet("strip-rep-ret",
  cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"),
  cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
VerifyCFG("verify-cfg",
  cl::desc("verify the CFG after every pass"),
  cl::init(false), cl::Hidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
TailDuplicationFlag("tail-duplication",
  cl::desc("duplicate unconditional branches that cross a cache line"),
  cl::ZeroOrMore, cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
ThreeWayBranchFlag("three-way-branch",
  cl::desc("reorder three way branches"),
  cl::ZeroOrMore, cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts
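
// The options above are consumed by the llvm-bolt command-line parser. A
// minimal usage sketch (the binary and output names are placeholders; only
// flags declared in this file are used):
//
//   llvm-bolt ./a.out -o ./a.out.bolt \
//     -icf -print-icf \
//     -eliminate-unreachable -print-uce \
//     -simplify-conditional-tail-calls -print-sctc \
//     -dyno-stats-all -verify-cfg
//
// Each "-print-*" flag dumps functions right after the corresponding pass,
// which is how the Print* options below are wired into runPasses().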

namespace llvm {
namespace bolt {

using namespace opts;

const char BinaryFunctionPassManager::TimerGroupName[] = "passman";
const char BinaryFunctionPassManager::TimerGroupDesc[] =
    "Binary Function Pass Manager";

void BinaryFunctionPassManager::runPasses() {
  auto &BFs = BC.getBinaryFunctions();
  for (size_t PassIdx = 0; PassIdx < Passes.size(); PassIdx++) {
    const std::pair<const bool, std::unique_ptr<BinaryFunctionPass>>
        &OptPassPair = Passes[PassIdx];
    if (!OptPassPair.first)
      continue;

    const std::unique_ptr<BinaryFunctionPass> &Pass = OptPassPair.second;
    std::string PassIdName =
        formatv("{0:2}_{1}", PassIdx, Pass->getName()).str();

    if (opts::Verbosity > 0)
      outs() << "BOLT-INFO: Starting pass: " << Pass->getName() << "\n";

    NamedRegionTimer T(Pass->getName(), Pass->getName(), TimerGroupName,
                       TimerGroupDesc, TimeOpts);

    callWithDynoStats([this, &Pass] { Pass->runOnFunctions(BC); }, BFs,
                      Pass->getName(), opts::DynoStatsAll);

    if (opts::VerifyCFG &&
        !std::accumulate(
            BFs.begin(), BFs.end(), true,
            [](const bool Valid,
               const std::pair<const uint64_t, BinaryFunction> &It) {
              return Valid && It.second.validateCFG();
            })) {
      errs() << "BOLT-ERROR: Invalid CFG detected after pass "
             << Pass->getName() << "\n";
      exit(1);
    }

    if (opts::Verbosity > 0)
      outs() << "BOLT-INFO: Finished pass: " << Pass->getName() << "\n";

    if (!opts::PrintAll && !opts::DumpDotAll && !Pass->printPass())
      continue;

    const std::string Message = std::string("after ") + Pass->getName();

    for (auto &It : BFs) {
      BinaryFunction &Function = It.second;

      if (!Pass->shouldPrint(Function))
        continue;

      Function.print(outs(), Message, true);

      if (opts::DumpDotAll)
        Function.dumpGraphForPass(PassIdName);
    }
  }
}
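
// A pass driven by this manager only needs the interface exercised in
// runPasses() above: a name, a runOnFunctions() body, and optionally a
// shouldPrint() predicate. A minimal sketch, assuming the BinaryFunctionPass
// base class takes the usual print-control cl::opt in its constructor (the
// class and option names below are hypothetical):
//
//   class HypotheticalPass : public BinaryFunctionPass {
//   public:
//     explicit HypotheticalPass(const cl::opt<bool> &PrintPass)
//         : BinaryFunctionPass(PrintPass) {}
//     const char *getName() const override { return "hypothetical-pass"; }
//     void runOnFunctions(BinaryContext &BC) override {
//       for (auto &It : BC.getBinaryFunctions()) {
//         BinaryFunction &Function = It.second;
//         (void)Function; // inspect or rewrite the function here
//       }
//     }
//   };
//
// It would be registered like the passes below; the optional second argument
// to registerPass() gates whether the pass runs at all:
//
//   Manager.registerPass(std::make_unique<HypotheticalPass>(NeverPrint), true);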

void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
  BinaryFunctionPassManager Manager(BC);

  const DynoStats InitialDynoStats = getDynoStats(BC.getBinaryFunctions());

  Manager.registerPass(std::make_unique<AsmDumpPass>(),
                       opts::AsmDump.getNumOccurrences());

  if (opts::Instrument)
    Manager.registerPass(std::make_unique<Instrumentation>(NeverPrint));

  // Here we manage dependencies/order manually, since passes are run in the
  // order they're registered.

  // Run this pass first to use stats for the original functions.
  Manager.registerPass(std::make_unique<PrintProgramStats>(NeverPrint));

  if (opts::PrintProfileStats)
    Manager.registerPass(std::make_unique<PrintProfileStats>(NeverPrint));

  Manager.registerPass(std::make_unique<ValidateInternalCalls>(NeverPrint));

  Manager.registerPass(std::make_unique<ShortenInstructions>(NeverPrint));

  Manager.registerPass(std::make_unique<RemoveNops>(NeverPrint));

  Manager.registerPass(std::make_unique<NormalizeCFG>(PrintNormalized));

  Manager.registerPass(std::make_unique<StripRepRet>(NeverPrint),
                       opts::StripRepRet);

  Manager.registerPass(std::make_unique<IdenticalCodeFolding>(PrintICF),
                       opts::ICF);

  if (BC.isAArch64())
    Manager.registerPass(
        std::make_unique<VeneerElimination>(PrintVeneerElimination));

  Manager.registerPass(
      std::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
      !opts::SpecializeMemcpy1.empty());

  Manager.registerPass(std::make_unique<InlineMemcpy>(NeverPrint),
                       opts::StringOps);

  Manager.registerPass(std::make_unique<IndirectCallPromotion>(PrintICP));

  Manager.registerPass(
      std::make_unique<JTFootprintReduction>(PrintJTFootprintReduction),
      opts::JTFootprintReductionFlag);

  Manager.registerPass(
      std::make_unique<SimplifyRODataLoads>(PrintSimplifyROLoads),
      opts::SimplifyRODataLoads);

  Manager.registerPass(std::make_unique<RegReAssign>(PrintRegReAssign),
                       opts::RegReAssign);

  Manager.registerPass(std::make_unique<Inliner>(PrintInline));

  Manager.registerPass(std::make_unique<IdenticalCodeFolding>(PrintICF),
                       opts::ICF);

  Manager.registerPass(std::make_unique<PLTCall>(PrintPLT));

  Manager.registerPass(std::make_unique<ThreeWayBranch>(),
                       opts::ThreeWayBranchFlag);

  Manager.registerPass(std::make_unique<ReorderBasicBlocks>(PrintReordered));

  Manager.registerPass(std::make_unique<EliminateUnreachableBlocks>(PrintUCE),
                       opts::EliminateUnreachable);

  Manager.registerPass(std::make_unique<SplitFunctions>(PrintSplit));

  Manager.registerPass(std::make_unique<LoopInversionPass>());

  Manager.registerPass(std::make_unique<TailDuplication>(),
                       opts::TailDuplicationFlag);

  // This pass syncs local branches with CFG. If any of the following
  // passes breaks the sync - they either need to re-run the pass or
  // fix branches consistency internally.
  Manager.registerPass(std::make_unique<FixupBranches>(PrintAfterBranchFixup));

  // This pass should come close to last since it uses the estimated hot
  // size of a function to determine the order. It should definitely
  // also happen after any changes to the call graph are made, e.g. inlining.
  Manager.registerPass(
      std::make_unique<ReorderFunctions>(PrintReorderedFunctions));

  // Print final dyno stats while CFG and instruction analysis are intact.
  Manager.registerPass(
      std::make_unique<DynoStatsPrintPass>(
          InitialDynoStats, "after all optimizations before SCTC and FOP"),
      opts::PrintDynoStats | opts::DynoStatsAll);

  // Add the StokeInfo pass, which extracts functions for stoke optimization
  // and gets the liveness information for them.
  Manager.registerPass(std::make_unique<StokeInfo>(PrintStoke), opts::Stoke);

  // This pass introduces conditional jumps into external functions.
  // Between extending CFG to support this and isolating this pass we chose
  // the latter. Thus this pass will do double jump removal and unreachable
  // code elimination if necessary and won't rely on peepholes/UCE for these
  // optimizations.
  // More generally this pass should be the last optimization pass that
  // modifies branches/control flow. This pass is run after function
  // reordering so that it can tell whether calls are forward/backward
  // accurately.
  Manager.registerPass(
      std::make_unique<SimplifyConditionalTailCalls>(PrintSCTC),
      opts::SimplifyConditionalTailCalls);

  Manager.registerPass(std::make_unique<Peepholes>(PrintPeepholes));

  Manager.registerPass(std::make_unique<AlignerPass>());

  // Perform reordering on data contained in one or more sections using
  // memory profiling data.
  Manager.registerPass(std::make_unique<ReorderData>());

  if (BC.isAArch64()) {
    Manager.registerPass(std::make_unique<ADRRelaxationPass>());

    // Tighten branches according to offset differences between branch and
    // targets. No extra instructions after this pass, otherwise we may have
    // relocations out of range and crash during linking.
    Manager.registerPass(std::make_unique<LongJmpPass>(PrintLongJmp));
  }
|
|
|
|
|
2017-03-21 13:44:25 +08:00
|
|
|
// This pass should always run last.*
|
2020-12-02 08:29:39 +08:00
|
|
|
Manager.registerPass(std::make_unique<FinalizeFunctions>(PrintFinalized));
|
2016-05-03 03:47:18 +08:00
|
|
|
|
2017-05-02 07:52:54 +08:00
|
|
|
// FrameOptimizer has an implicit dependency on FinalizeFunctions.
|
|
|
|
// FrameOptimizer move values around and needs to update CFIs. To do this, it
|
|
|
|
// must read CFI, interpret it and rewrite it, so CFIs need to be correctly
|
|
|
|
// placed according to the final layout.
|
2020-12-02 08:29:39 +08:00
|
|
|
Manager.registerPass(std::make_unique<FrameOptimizerPass>(PrintFOP));
|
2017-05-02 07:52:54 +08:00
|
|
|
|
2020-12-02 08:29:39 +08:00
|
|
|
Manager.registerPass(std::make_unique<AllocCombinerPass>(PrintFOP));
|
2017-05-02 07:52:54 +08:00
|
|
|
|
2018-07-26 10:07:41 +08:00
|
|
|
Manager.registerPass(
|
2020-12-02 08:29:39 +08:00
|
|
|
std::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
|
2018-07-26 10:07:41 +08:00
|
|
|
|
2019-03-16 04:43:36 +08:00
|
|
|
// Assign each function an output section.
|
2020-12-02 08:29:39 +08:00
|
|
|
Manager.registerPass(std::make_unique<AssignSections>());
|
2019-03-16 04:43:36 +08:00
|
|
|
|
[BOLT] Support for lite mode with relocations
Summary:
Add '-lite' support for relocations for improved processing time,
memory consumption, and more resilient processing of binaries with
embedded assembly code.
In lite relocation mode, BOLT will skip full processing of functions
without a profile. It will run scanExternalRefs() on such functions
to discover external references and to create internal relocations
to update references to optimized functions.
Note that we could have relied on the compiler/linker to provide
relocations for function references. However, there's no assurance
that all such references are reported. E.g., the compiler can resolve
inter-procedural references internally, leaving no relocations
for the linker.
The scan process takes about <10 seconds per 100MB of code on modern
hardware. It's a reasonable overhead to live with considering the
flexibility it provides.
If BOLT fails to scan or disassemble a function, .e.g., due to a data
object embedded in code, or an unsupported instruction, it enables a
patching mode to guarantee that the failed function will call
optimized/moved versions of functions. The patching happens at original
function entry points.
'-skip=<func1,func2,...>' option now can be used to skip processing of
arbitrary functions in the relocation mode.
With '-use-old-text' or '-strict' we require all functions to be
processed. As such, it is incompatible with '-lite' option,
and '-skip' option will only disable optimizations of listed
functions, not their disassembly and emission.
(cherry picked from FBD22040717)
2020-06-15 15:15:47 +08:00
|
|
|
// Patch original function entries
|
2020-10-10 10:37:12 +08:00
|
|
|
if (BC.HasRelocations)
|
2020-12-02 08:29:39 +08:00
|
|
|
Manager.registerPass(std::make_unique<PatchEntries>());

  // This pass turns tail calls into jumps which makes them invisible to
  // function reordering. It's unsafe to use any CFG or instruction analysis
  // after this point.
Manager.registerPass(
std::make_unique<InstructionLowering>(PrintAfterLowering));
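  // After lowering, a tail call is just an unconditional jump whose target
  // lives in another function, so a later pass rebuilding or analyzing the
  // CFG would misread it as a regular intra-function branch.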

  // In non-relocation mode, mark functions that do not fit into their
  // original space as non-simple if we have to (e.g. for correct debug info
  // update). NOTE: this pass depends on finalized code.
if (!BC.HasRelocations)
Manager.registerPass(std::make_unique<CheckLargeFunctions>(NeverPrint));
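  // In non-relocation mode the optimized body has to fit into the original
  // function boundaries. Code size is estimated before emission, but the
  // estimate is not perfectly accurate, so functions that still do not fit
  // are caught and marked non-simple here.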
Manager.registerPass(std::make_unique<LowerAnnotations>(NeverPrint));
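  // LowerAnnotations strips the in-memory annotations BOLT attaches to
  // instructions so that plain machine instructions reach the emitter (a
  // rough description; see the pass implementation for the exact behavior).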
Manager.runPasses();
}
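
// A hedged sketch of how this pass pipeline is typically driven (the call
// site and exact names below are assumed, not taken from this file):
//
//   // In the rewriting stage, after functions are disassembled and CFGs built:
//   BinaryFunctionPassManager::runAllPasses(BC);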
} // namespace bolt
} // namespace llvm