2021-12-22 02:21:41 +08:00
|
|
|
//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
|
2017-03-09 11:58:33 +08:00
|
|
|
//
|
2021-03-16 09:04:18 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2017-03-09 11:58:33 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2021-12-22 02:21:41 +08:00
|
|
|
// This file implements the IndirectCallPromotion class.
|
|
|
|
//
|
2017-03-09 11:58:33 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2021-10-09 02:47:10 +08:00
|
|
|
#include "bolt/Passes/IndirectCallPromotion.h"
|
|
|
|
#include "bolt/Passes/BinaryFunctionCallGraph.h"
|
|
|
|
#include "bolt/Passes/DataflowInfoManager.h"
|
2020-12-02 08:29:39 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
#define DEBUG_TYPE "ICP"
|
2021-12-15 08:52:51 +08:00
|
|
|
// Executes the statement(s) X only when opts::Verbosity is at least Level.
//
// NOTE: this expands to a bare `if` statement (no do/while wrapper), so it
// must not be used as the unbraced body of an `if` that has an `else`.
#define DEBUG_VERBOSE(Level, X)                                                \
  if ((Level) <= opts::Verbosity) {                                            \
    X;                                                                         \
  }
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
using namespace bolt;
|
|
|
|
|
|
|
|
namespace opts {
|
|
|
|
|
|
|
|
extern cl::OptionCategory BoltOptCategory;
|
|
|
|
|
2021-10-09 02:47:10 +08:00
|
|
|
extern cl::opt<IndirectCallPromotionType> IndirectCallPromotion;
|
2017-03-09 11:58:33 +08:00
|
|
|
extern cl::opt<unsigned> Verbosity;
|
2020-07-28 09:07:18 +08:00
|
|
|
extern cl::opt<unsigned> ExecutionCountThreshold;
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
"icp-jt-remaining-percent-threshold",
|
|
|
|
cl::desc("The percentage threshold against remaining unpromoted indirect "
|
|
|
|
"call count for the promotion for jump tables"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<unsigned> ICPJTTotalPercentThreshold(
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
"icp-jt-total-percent-threshold",
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::desc(
|
|
|
|
"The percentage threshold against total count for the promotion for "
|
|
|
|
"jump tables"),
|
|
|
|
cl::init(5), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
|
|
|
|
static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
"icp-calls-remaining-percent-threshold",
|
|
|
|
cl::desc("The percentage threshold against remaining unpromoted indirect "
|
|
|
|
"call count for the promotion for calls"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(50), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
"icp-calls-total-percent-threshold",
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::desc(
|
|
|
|
"The percentage threshold against total count for the promotion for "
|
|
|
|
"calls"),
|
|
|
|
cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
|
|
|
|
static cl::opt<unsigned> IndirectCallPromotionMispredictThreshold(
|
2017-03-09 11:58:33 +08:00
|
|
|
"indirect-call-promotion-mispredict-threshold",
|
|
|
|
cl::desc("misprediction threshold for skipping ICP on an "
|
|
|
|
"indirect call"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(0), cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<bool> IndirectCallPromotionUseMispredicts(
|
2017-03-09 11:58:33 +08:00
|
|
|
"indirect-call-promotion-use-mispredicts",
|
|
|
|
cl::desc("use misprediction frequency for determining whether or not ICP "
|
|
|
|
"should be applied at a callsite. The "
|
|
|
|
"-indirect-call-promotion-mispredict-threshold value will be used "
|
|
|
|
"by this heuristic"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<unsigned> IndirectCallPromotionTopN(
|
2017-03-09 11:58:33 +08:00
|
|
|
"indirect-call-promotion-topn",
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
cl::desc("limit number of targets to consider when doing indirect "
|
|
|
|
"call promotion. 0 = no limit"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(3), cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<unsigned> IndirectCallPromotionCallsTopN(
|
2017-10-21 03:11:34 +08:00
|
|
|
"indirect-call-promotion-calls-topn",
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
cl::desc("limit number of targets to consider when doing indirect "
|
|
|
|
"call promotion on calls. 0 = no limit"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(0), cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<unsigned> IndirectCallPromotionJumpTablesTopN(
|
2017-10-21 03:11:34 +08:00
|
|
|
"indirect-call-promotion-jump-tables-topn",
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
cl::desc("limit number of targets to consider when doing indirect "
|
|
|
|
"call promotion on jump tables. 0 = no limit"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(0), cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<bool> EliminateLoads(
|
2017-10-21 03:11:34 +08:00
|
|
|
"icp-eliminate-loads",
|
|
|
|
cl::desc("enable load elimination using memory profiling data when "
|
|
|
|
"performing ICP"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
static cl::opt<unsigned> ICPTopCallsites(
|
2017-10-21 03:11:34 +08:00
|
|
|
"icp-top-callsites",
|
2019-12-14 08:46:00 +08:00
|
|
|
cl::desc("optimize hottest calls until at least this percentage of all "
|
|
|
|
"indirect calls frequency is covered. 0 = all callsites"),
|
2021-12-15 08:52:51 +08:00
|
|
|
cl::init(99), cl::Hidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
static cl::list<std::string>
|
2021-12-15 08:52:51 +08:00
|
|
|
ICPFuncsList("icp-funcs", cl::CommaSeparated,
|
|
|
|
cl::desc("list of functions to enable ICP for"),
|
|
|
|
cl::value_desc("func1,func2,func3,..."), cl::Hidden,
|
|
|
|
cl::cat(BoltOptCategory));
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
static cl::opt<bool>
|
2021-12-15 08:52:51 +08:00
|
|
|
ICPOldCodeSequence("icp-old-code-sequence",
|
|
|
|
cl::desc("use old code sequence for promoted calls"),
|
|
|
|
cl::init(false), cl::ZeroOrMore, cl::Hidden,
|
|
|
|
cl::cat(BoltOptCategory));
|
2017-03-09 11:58:33 +08:00
|
|
|
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
static cl::opt<bool> ICPJumpTablesByTarget(
|
|
|
|
"icp-jump-tables-targets",
|
|
|
|
cl::desc(
|
|
|
|
"for jump tables, optimize indirect jmp targets instead of indices"),
|
|
|
|
cl::init(false), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
|
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
} // namespace opts
|
|
|
|
|
|
|
|
namespace llvm {
|
|
|
|
namespace bolt {
|
|
|
|
|
2018-08-31 04:21:50 +08:00
|
|
|
namespace {
|
|
|
|
|
|
|
|
bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
|
|
|
|
bool IsValid = true;
|
|
|
|
for (auto &BFI : BFs) {
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryFunction &BF = BFI.second;
|
2021-12-15 08:52:51 +08:00
|
|
|
if (!BF.isSimple())
|
|
|
|
continue;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock *BB : BF.layout()) {
|
2018-08-31 04:21:50 +08:00
|
|
|
auto BI = BB->branch_info_begin();
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock *SuccBB : BB->successors()) {
|
2018-08-31 04:21:50 +08:00
|
|
|
if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
|
|
|
|
if (BB->getKnownExecutionCount() == 0 ||
|
|
|
|
SuccBB->getKnownExecutionCount() == 0) {
|
|
|
|
errs() << "BOLT-WARNING: profile verification failed after ICP for "
|
2021-12-15 08:52:51 +08:00
|
|
|
"function "
|
|
|
|
<< BF << '\n';
|
2018-08-31 04:21:50 +08:00
|
|
|
IsValid = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
++BI;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return IsValid;
|
|
|
|
}
|
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
} // namespace
|
2018-08-31 04:21:50 +08:00
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
/// Build a call site record for function \p BF from the profile entry \p ICP.
///
/// The target is first initialized from ICP.Offset. When the profile entry
/// carries a symbol, the target is re-pointed at that symbol and its address
/// is reset to zero.
IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
                                          const IndirectCallProfile &ICP)
    : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
      Branches(ICP.Count) {
  // No symbol available: keep the offset-based target as is.
  if (!ICP.Symbol)
    return;

  To.Sym = ICP.Symbol;
  To.Addr = 0;
}
|
|
|
|
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
// Write a summary of the promotion decision for one call site to OS: a header
// line with the number of targets and the aggregate branch / mispredict
// counts, followed by one line per target giving its counts and its share of
// each total. Targets whose running slot position is below N are tagged as
// "to be optimized"; a target with jump table indices occupies one slot per
// index, any other target occupies exactly one slot.
void IndirectCallPromotion::printDecision(
    llvm::raw_ostream &OS,
    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
  uint64_t BranchSum = 0;
  uint64_t MispredSum = 0;
  for (const Callsite &CS : Targets) {
    BranchSum += CS.Branches;
    MispredSum += CS.Mispreds;
  }

  // A zero total is replaced by 1 so the percentage computations below never
  // divide by zero. The substituted value is also what gets printed.
  const uint64_t TotalCount = BranchSum ? BranchSum : 1;
  const uint64_t TotalMispreds = MispredSum ? MispredSum : 1;

  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
     << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
     << "\n";

  // Number of promotion slots consumed by the targets printed so far.
  size_t SlotsUsed = 0;
  for (const Callsite &CS : Targets) {
    const double BranchPct = (100.0 * CS.Branches) / TotalCount;
    const double MispredPct = (100.0 * CS.Mispreds) / TotalMispreds;

    OS << "Count = " << CS.Branches << ", " << format("%.1f", BranchPct)
       << ", "
       << "Mispreds = " << CS.Mispreds << ", " << format("%.1f", MispredPct);

    if (SlotsUsed < N)
      OS << " * to be optimized *";

    const bool HasIndices = !CS.JTIndices.empty();
    if (HasIndices) {
      OS << " Indices:";
      for (const uint64_t Idx : CS.JTIndices)
        OS << " " << Idx;
    }
    OS << "\n";

    SlotsUsed += HasIndices ? CS.JTIndices.size() : 1;
  }
}
|
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
// Get list of targets for a given call sorted by most frequently
|
|
|
|
// called first.
|
|
|
|
std::vector<IndirectCallPromotion::Callsite>
|
2021-10-26 15:06:34 +08:00
|
|
|
IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
|
|
|
|
const MCInst &Inst) const {
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryFunction &BF = *BB.getFunction();
|
|
|
|
BinaryContext &BC = BF.getBinaryContext();
|
2017-03-09 11:58:33 +08:00
|
|
|
std::vector<Callsite> Targets;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
if (const JumpTable *JT = BF.getJumpTable(Inst)) {
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
// Don't support PIC jump tables for now
|
2017-11-15 12:05:11 +08:00
|
|
|
if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
return Targets;
|
2017-03-09 11:58:33 +08:00
|
|
|
const Location From(BF.getSymbol());
|
2021-04-08 15:19:26 +08:00
|
|
|
const std::pair<size_t, size_t> Range =
|
|
|
|
JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
|
2017-03-09 11:58:33 +08:00
|
|
|
assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
|
2017-11-15 12:05:11 +08:00
|
|
|
JumpTable::JumpInfo DefaultJI;
|
2021-04-08 15:19:26 +08:00
|
|
|
const JumpTable::JumpInfo *JI =
|
|
|
|
JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
|
2017-03-09 11:58:33 +08:00
|
|
|
const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
|
2017-11-15 12:05:11 +08:00
|
|
|
assert(JT->Type == JumpTable::JTT_PIC ||
|
[BOLT rebase] Rebase fixes on top of LLVM Feb2018
Summary:
This commit includes all code necessary to make BOLT working again
after the rebase. This includes a redesign of the EHFrame work,
cherry-pick of the 3dnow disassembly work, compilation error fixes,
and port of the debug_info work. The macroop fusion feature is not
ported yet.
The rebased version has minor changes to the "executed instructions"
dynostats counter because REP prefixes are considered a part of the
instruction it applies to. Also, some X86 instructions had the "mayLoad"
tablegen property removed, which BOLT uses to identify and account
for loads, thus reducing the total number of loads reported by
dynostats. This was observed in X86::MOVDQUmr. TRAP instructions are
not terminators anymore, changing our CFG. This commit adds compensation
to preserve this old behavior and minimize tests changes. debug_info
sections are now slightly larger. The discriminator field in the line
table is slightly different due to a change upstream. New profiles
generated with the other bolt are incompatible with this version
because of different hash values calculated for functions, so they will
be considered 100% stale. This commit changes the corresponding test
to XFAIL so it can be updated. The hash function changes because it
relies on raw opcode values, which change according to the opcodes
described in the X86 tablegen files. When processing HHVM, bolt was
observed to be using about 800MB more memory in the rebased version
and being about 5% slower.
(cherry picked from FBD7078072)
2018-02-07 07:00:23 +08:00
|
|
|
JT->EntrySize == BC.AsmInfo->getCodePointerSize());
|
2017-03-09 11:58:33 +08:00
|
|
|
for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
|
2021-04-08 15:19:26 +08:00
|
|
|
MCSymbol *Entry = JT->Entries[I];
|
2017-03-09 11:58:33 +08:00
|
|
|
assert(BF.getBasicBlockForLabel(Entry) ||
|
|
|
|
Entry == BF.getFunctionEndLabel() ||
|
|
|
|
Entry == BF.getFunctionColdEndLabel());
|
2017-11-30 09:38:39 +08:00
|
|
|
if (Entry == BF.getFunctionEndLabel() ||
|
|
|
|
Entry == BF.getFunctionColdEndLabel())
|
|
|
|
continue;
|
2017-03-09 11:58:33 +08:00
|
|
|
const Location To(Entry);
|
2021-04-08 15:19:26 +08:00
|
|
|
const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(Entry);
|
2021-12-15 08:52:51 +08:00
|
|
|
Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
|
|
|
|
I - Range.first);
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Sort by symbol then addr.
|
|
|
|
std::sort(Targets.begin(), Targets.end(),
|
|
|
|
[](const Callsite &A, const Callsite &B) {
|
2017-12-14 15:12:01 +08:00
|
|
|
if (A.To.Sym && B.To.Sym)
|
2017-03-09 11:58:33 +08:00
|
|
|
return A.To.Sym < B.To.Sym;
|
2017-12-14 15:12:01 +08:00
|
|
|
else if (A.To.Sym && !B.To.Sym)
|
2017-03-09 11:58:33 +08:00
|
|
|
return true;
|
2017-12-14 15:12:01 +08:00
|
|
|
else if (!A.To.Sym && B.To.Sym)
|
2017-03-09 11:58:33 +08:00
|
|
|
return false;
|
|
|
|
else
|
|
|
|
return A.To.Addr < B.To.Addr;
|
|
|
|
});
|
|
|
|
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
// Targets may contain multiple entries to the same target, but using
|
|
|
|
// different indices. Their profile will report the same number of branches
|
|
|
|
// for different indices if the target is the same. That's because we don't
|
|
|
|
// profile the index value, but only the target via LBR.
|
2017-03-09 11:58:33 +08:00
|
|
|
auto First = Targets.begin();
|
|
|
|
auto Last = Targets.end();
|
|
|
|
auto Result = First;
|
|
|
|
while (++First != Last) {
|
2021-04-08 15:19:26 +08:00
|
|
|
Callsite &A = *Result;
|
|
|
|
const Callsite &B = *First;
|
2021-12-29 08:36:17 +08:00
|
|
|
if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
|
|
|
|
B.JTIndices.end());
|
2021-12-29 08:36:17 +08:00
|
|
|
else
|
2017-03-09 11:58:33 +08:00
|
|
|
*(++Result) = *First;
|
|
|
|
}
|
|
|
|
++Result;
|
|
|
|
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG(if (Targets.end() - Result > 0) {
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
|
|
|
|
<< " duplicate targets removed\n";
|
|
|
|
});
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
Targets.erase(Result, Targets.end());
|
|
|
|
} else {
|
2017-10-21 03:11:34 +08:00
|
|
|
// Don't try to optimize PC relative indirect calls.
|
|
|
|
if (Inst.getOperand(0).isReg() &&
|
2021-12-29 08:36:17 +08:00
|
|
|
Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
|
2017-10-21 03:11:34 +08:00
|
|
|
return Targets;
|
2021-12-29 08:36:17 +08:00
|
|
|
|
2021-04-09 14:31:12 +08:00
|
|
|
const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
|
2021-12-15 08:52:51 +08:00
|
|
|
Inst, "CallProfile");
|
2017-12-14 15:12:01 +08:00
|
|
|
if (ICSP) {
|
2021-04-08 15:19:26 +08:00
|
|
|
for (const IndirectCallProfile &CSP : ICSP.get()) {
|
2017-12-14 15:12:01 +08:00
|
|
|
Callsite Site(BF, CSP);
|
|
|
|
if (Site.isValid())
|
|
|
|
Targets.emplace_back(std::move(Site));
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-20 12:20:55 +08:00
|
|
|
// Sort by target count, number of indices in case of jump table, and
|
|
|
|
// mispredicts. We prioritize targets with high count, small number of indices
|
|
|
|
// and high mispredicts. Break ties by selecting targets with lower addresses.
|
2018-08-31 04:21:50 +08:00
|
|
|
std::stable_sort(Targets.begin(), Targets.end(),
|
|
|
|
[](const Callsite &A, const Callsite &B) {
|
2019-07-25 08:54:14 +08:00
|
|
|
if (A.Branches != B.Branches)
|
2019-07-03 06:51:20 +08:00
|
|
|
return A.Branches > B.Branches;
|
2022-01-20 12:20:55 +08:00
|
|
|
if (A.JTIndices.size() != B.JTIndices.size())
|
2019-07-03 06:51:20 +08:00
|
|
|
return A.JTIndices.size() < B.JTIndices.size();
|
2022-01-20 12:20:55 +08:00
|
|
|
if (A.Mispreds != B.Mispreds)
|
2019-07-03 06:51:20 +08:00
|
|
|
return A.Mispreds > B.Mispreds;
|
2022-01-20 12:20:55 +08:00
|
|
|
return A.To.Addr < B.To.Addr;
|
2018-08-31 04:21:50 +08:00
|
|
|
});
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
// Remove non-symbol targets
|
2021-12-15 08:52:51 +08:00
|
|
|
auto Last = std::remove_if(Targets.begin(), Targets.end(),
|
|
|
|
[](const Callsite &CS) { return !CS.To.Sym; });
|
2017-03-09 11:58:33 +08:00
|
|
|
Targets.erase(Last, Targets.end());
|
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
|
|
|
|
uint64_t TotalCount = 0;
|
|
|
|
uint64_t TotalMispreds = 0;
|
|
|
|
for (const Callsite &S : Targets) {
|
|
|
|
TotalCount += S.Branches;
|
|
|
|
TotalMispreds += S.Mispreds;
|
|
|
|
}
|
|
|
|
if (!TotalCount)
|
|
|
|
TotalCount = 1;
|
|
|
|
if (!TotalMispreds)
|
|
|
|
TotalMispreds = 1;
|
|
|
|
|
|
|
|
dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
|
|
|
|
<< ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
|
|
|
|
<< "\n";
|
|
|
|
|
|
|
|
size_t I = 0;
|
|
|
|
for (const Callsite &S : Targets) {
|
|
|
|
dbgs() << "Count[" << I << "] = " << S.Branches << ", "
|
|
|
|
<< format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
|
|
|
|
<< "Mispreds[" << I << "] = " << S.Mispreds << ", "
|
|
|
|
<< format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
});
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
return Targets;
|
|
|
|
}
|
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
IndirectCallPromotion::JumpTableInfoType
|
2021-10-26 15:06:34 +08:00
|
|
|
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
|
|
|
|
MCInst &CallInst,
|
|
|
|
MCInst *&TargetFetchInst,
|
|
|
|
const JumpTable *JT) const {
|
2017-10-21 03:11:34 +08:00
|
|
|
assert(JT && "Can't get jump table addrs for non-jump tables.");
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
BinaryFunction &Function = *BB.getFunction();
|
|
|
|
BinaryContext &BC = Function.getBinaryContext();
|
|
|
|
|
2020-05-08 14:00:29 +08:00
|
|
|
if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
|
2017-10-21 03:11:34 +08:00
|
|
|
return JumpTableInfoType();
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
JumpTableInfoType HotTargets;
|
2017-10-21 03:11:34 +08:00
|
|
|
MCInst *MemLocInstr;
|
|
|
|
MCInst *PCRelBaseOut;
|
|
|
|
unsigned BaseReg, IndexReg;
|
|
|
|
int64_t DispValue;
|
|
|
|
const MCExpr *DispExpr;
|
2021-10-26 15:06:34 +08:00
|
|
|
MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
|
2021-04-08 15:19:26 +08:00
|
|
|
const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
|
[BOLT rebase] Rebase fixes on top of LLVM Feb2018
Summary:
This commit includes all code necessary to make BOLT working again
after the rebase. This includes a redesign of the EHFrame work,
cherry-pick of the 3dnow disassembly work, compilation error fixes,
and port of the debug_info work. The macroop fusion feature is not
ported yet.
The rebased version has minor changes to the "executed instructions"
dynostats counter because REP prefixes are considered a part of the
instruction it applies to. Also, some X86 instructions had the "mayLoad"
tablegen property removed, which BOLT uses to identify and account
for loads, thus reducing the total number of loads reported by
dynostats. This was observed in X86::MOVDQUmr. TRAP instructions are
not terminators anymore, changing our CFG. This commit adds compensation
to preserve this old behavior and minimize tests changes. debug_info
sections are now slightly larger. The discriminator field in the line
table is slightly different due to a change upstream. New profiles
generated with the other bolt are incompatible with this version
because of different hash values calculated for functions, so they will
be considered 100% stale. This commit changes the corresponding test
to XFAIL so it can be updated. The hash function changes because it
relies on raw opcode values, which change according to the opcodes
described in the X86 tablegen files. When processing HHVM, bolt was
observed to be using about 800MB more memory in the rebased version
and being about 5% slower.
(cherry picked from FBD7078072)
2018-02-07 07:00:23 +08:00
|
|
|
CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
|
|
|
|
MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
assert(MemLocInstr && "There should always be a load for jump tables");
|
|
|
|
if (!MemLocInstr)
|
|
|
|
return JumpTableInfoType();
|
|
|
|
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG({
|
2021-10-26 15:06:34 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
|
|
|
|
<< "jump table in " << Function << " at @ "
|
|
|
|
<< (&CallInst - &BB.front()) << "\n"
|
|
|
|
<< "BOLT-INFO: ICP target fetch instructions:\n";
|
|
|
|
BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
|
2021-12-29 08:36:17 +08:00
|
|
|
if (MemLocInstr != &CallInst)
|
2021-10-26 15:06:34 +08:00
|
|
|
BC.printInstruction(dbgs(), CallInst, 0, &Function);
|
|
|
|
});
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
DEBUG_VERBOSE(1, {
|
2021-12-15 08:52:51 +08:00
|
|
|
dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
|
|
|
|
<< "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
|
|
|
|
<< "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
|
|
|
|
<< "DispValue = " << Twine::utohexstr(DispValue) << ", "
|
|
|
|
<< "DispExpr = " << DispExpr << ", "
|
|
|
|
<< "MemLocInstr = ";
|
|
|
|
BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
|
|
|
|
dbgs() << "\n";
|
|
|
|
});
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
++TotalIndexBasedCandidates;
|
|
|
|
|
2020-05-08 14:00:29 +08:00
|
|
|
auto ErrorOrMemAccesssProfile =
|
2021-12-15 08:52:51 +08:00
|
|
|
BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
|
|
|
|
"MemoryAccessProfile");
|
2020-05-08 14:00:29 +08:00
|
|
|
if (!ErrorOrMemAccesssProfile) {
|
2021-12-15 08:52:51 +08:00
|
|
|
DEBUG_VERBOSE(1, dbgs()
|
|
|
|
<< "BOLT-INFO: ICP no memory profiling data found\n");
|
2017-10-21 03:11:34 +08:00
|
|
|
return JumpTableInfoType();
|
|
|
|
}
|
2021-04-08 15:19:26 +08:00
|
|
|
MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
uint64_t ArrayStart;
|
|
|
|
if (DispExpr) {
|
2021-04-08 15:19:26 +08:00
|
|
|
ErrorOr<uint64_t> DispValueOrError =
|
2020-12-02 08:29:39 +08:00
|
|
|
BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
|
2019-06-05 06:30:22 +08:00
|
|
|
assert(DispValueOrError && "global symbol needs a value");
|
|
|
|
ArrayStart = *DispValueOrError;
|
2017-10-21 03:11:34 +08:00
|
|
|
} else {
|
|
|
|
ArrayStart = static_cast<uint64_t>(DispValue);
|
|
|
|
}
|
|
|
|
|
2021-12-29 08:36:17 +08:00
|
|
|
if (BaseReg == BC.MRI->getProgramCounter())
|
2020-05-08 14:00:29 +08:00
|
|
|
ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2017-11-30 09:38:39 +08:00
|
|
|
// This is a map of [symbol] -> [count, index] and is used to combine indices
|
|
|
|
// into the jump table since there may be multiple addresses that all have the
|
|
|
|
// same entry.
|
|
|
|
std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
|
2021-04-08 15:19:26 +08:00
|
|
|
const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);
|
2017-11-30 09:38:39 +08:00
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
|
2017-10-21 03:11:34 +08:00
|
|
|
size_t Index;
|
2020-05-08 14:00:29 +08:00
|
|
|
// Mem data occasionally includes nullprs, ignore them.
|
|
|
|
if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
|
2017-10-21 03:11:34 +08:00
|
|
|
continue;
|
|
|
|
|
2020-05-08 14:00:29 +08:00
|
|
|
if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
|
2017-11-30 09:38:39 +08:00
|
|
|
return JumpTableInfoType();
|
|
|
|
|
2020-05-08 14:00:29 +08:00
|
|
|
if (AccessInfo.MemoryObject) {
|
2017-10-21 03:11:34 +08:00
|
|
|
// Deal with bad/stale data
|
2021-12-15 08:52:51 +08:00
|
|
|
if (!AccessInfo.MemoryObject->getName().startswith(
|
|
|
|
"JUMP_TABLE/" + Function.getOneName().str()))
|
2017-11-30 09:38:39 +08:00
|
|
|
return JumpTableInfoType();
|
2020-05-08 14:00:29 +08:00
|
|
|
Index =
|
2021-12-15 08:52:51 +08:00
|
|
|
(AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
|
2017-10-21 03:11:34 +08:00
|
|
|
} else {
|
2020-05-08 14:00:29 +08:00
|
|
|
Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// If Index is out of range it probably means the memory profiling data is
|
|
|
|
// wrong for this instruction, bail out.
|
2017-11-30 09:38:39 +08:00
|
|
|
if (Index >= Range.second) {
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
|
|
|
|
<< ", " << Range.second << "\n");
|
2017-11-30 09:38:39 +08:00
|
|
|
return JumpTableInfoType();
|
|
|
|
}
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2017-11-30 09:38:39 +08:00
|
|
|
// Make sure the hot index points at a legal label corresponding to a BB,
|
|
|
|
// e.g. not the end of function (unreachable) label.
|
|
|
|
if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG({
|
2021-12-15 08:52:51 +08:00
|
|
|
dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
|
|
|
|
<< "label " << JT->Entries[Index + Range.first]->getName()
|
|
|
|
<< " in jump table:\n";
|
|
|
|
JT->print(dbgs());
|
|
|
|
dbgs() << "HotTargetMap:\n";
|
|
|
|
for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
|
2021-12-29 08:36:17 +08:00
|
|
|
HotTargetMap)
|
2021-12-15 08:52:51 +08:00
|
|
|
dbgs() << "BOLT-INFO: " << HT.first->getName()
|
2021-12-29 08:37:53 +08:00
|
|
|
<< " = (count=" << HT.second.first
|
|
|
|
<< ", index=" << HT.second.second << ")\n";
|
2021-12-15 08:52:51 +08:00
|
|
|
});
|
2017-11-30 09:38:39 +08:00
|
|
|
return JumpTableInfoType();
|
|
|
|
}
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
std::pair<uint64_t, uint64_t> &HotTarget =
|
|
|
|
HotTargetMap[JT->Entries[Index + Range.first]];
|
2020-05-08 14:00:29 +08:00
|
|
|
HotTarget.first += AccessInfo.Count;
|
2017-11-30 09:38:39 +08:00
|
|
|
HotTarget.second = Index;
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
|
|
|
|
2017-11-30 09:38:39 +08:00
|
|
|
std::transform(
|
2021-12-15 08:52:51 +08:00
|
|
|
HotTargetMap.begin(), HotTargetMap.end(), std::back_inserter(HotTargets),
|
|
|
|
[](const std::pair<MCSymbol *, std::pair<uint64_t, uint64_t>> &A) {
|
|
|
|
return A.second;
|
|
|
|
});
|
2017-11-30 09:38:39 +08:00
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
// Sort with highest counts first.
|
|
|
|
std::sort(HotTargets.rbegin(), HotTargets.rend());
|
|
|
|
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG({
|
2021-12-15 08:52:51 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
|
2021-12-29 08:36:17 +08:00
|
|
|
for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
|
2021-12-15 08:52:51 +08:00
|
|
|
dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
|
|
|
|
<< "Count = " << Target.first << "\n";
|
|
|
|
});
|
2017-10-21 03:11:34 +08:00
|
|
|
|
[BOLT][Refactoring] Isolate changes to MC layer
Summary:
Changes that we made to MCInst, MCOperand, MCExpr, etc. are now all
moved into tools/llvm-bolt. That required a change to the way we handle
annotations and any extra operands for MCInst.
Any MCPlus information is now attached via an extra operand of type
MCInst with an opcode ANNOTATION_LABEL. Since this operand is MCInst, we
attach extra info as operands to this instruction. For first-level
annotations use functions to access the information, such as
getConditionalTailCall() or getEHInfo(), etc. For the rest, optional or
second-class annotations, use a general named-annotation interface such
as getAnnotationAs<uint64_t>(Inst, "Count").
I ran a test on the HHVM binary, and memory consumption went down a little
bit while the runtime remained the same.
(cherry picked from FBD7405412)
2018-03-20 09:32:12 +08:00
|
|
|
BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
TargetFetchInst = MemLocInstr;
|
|
|
|
|
|
|
|
return HotTargets;
|
|
|
|
}
|
|
|
|
|
|
|
|
IndirectCallPromotion::SymTargetsType
|
2021-10-26 15:06:34 +08:00
|
|
|
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
|
|
|
|
size_t &N, BinaryBasicBlock &BB,
|
|
|
|
MCInst &CallInst,
|
|
|
|
MCInst *&TargetFetchInst) const {
|
|
|
|
const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
|
2017-10-21 03:11:34 +08:00
|
|
|
SymTargetsType SymTargets;
|
|
|
|
|
|
|
|
if (JT) {
|
2021-10-26 15:06:34 +08:00
|
|
|
JumpTableInfoType HotTargets =
|
|
|
|
maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
if (!HotTargets.empty()) {
|
2017-11-30 09:38:39 +08:00
|
|
|
auto findTargetsIndex = [&](uint64_t JTIndex) {
|
|
|
|
for (size_t I = 0; I < Targets.size(); ++I) {
|
2021-04-08 15:19:26 +08:00
|
|
|
std::vector<uint64_t> &JTIs = Targets[I].JTIndices;
|
2017-11-30 09:38:39 +08:00
|
|
|
if (std::find(JTIs.begin(), JTIs.end(), JTIndex) != JTIs.end())
|
|
|
|
return I;
|
|
|
|
}
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
|
2021-10-26 15:06:34 +08:00
|
|
|
<< " table entry in " << *BB.getFunction() << "\n");
|
2017-11-30 09:38:39 +08:00
|
|
|
llvm_unreachable("Hot indices must be referred to by at least one "
|
|
|
|
"callsite");
|
|
|
|
};
|
|
|
|
|
2021-12-29 08:36:17 +08:00
|
|
|
if (opts::Verbosity >= 1)
|
|
|
|
for (size_t I = 0; I < HotTargets.size(); ++I)
|
2017-11-30 09:38:39 +08:00
|
|
|
outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
|
2021-12-15 08:52:51 +08:00
|
|
|
<< HotTargets[I].first << ", " << HotTargets[I].second
|
|
|
|
<< ")\n";
|
2017-11-30 09:38:39 +08:00
|
|
|
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
// Recompute hottest targets, now discriminating which index is hot
|
|
|
|
// NOTE: This is a tradeoff. On one hand, we get index information. On the
|
|
|
|
// other hand, info coming from the memory profile is much less accurate
|
|
|
|
// than LBRs. So we may actually end up working with more coarse
|
|
|
|
// profile granularity in exchange for information about indices.
|
2017-11-30 09:38:39 +08:00
|
|
|
std::vector<Callsite> NewTargets;
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
|
|
|
|
uint64_t TotalMemAccesses = 0;
|
|
|
|
for (size_t I = 0; I < HotTargets.size(); ++I) {
|
2021-04-08 15:19:26 +08:00
|
|
|
const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
++IndicesPerTarget[Targets[TargetIndex].To.Sym];
|
|
|
|
TotalMemAccesses += HotTargets[I].first;
|
|
|
|
}
|
|
|
|
uint64_t RemainingMemAccesses = TotalMemAccesses;
|
|
|
|
const size_t TopN = opts::IndirectCallPromotionJumpTablesTopN != 0
|
|
|
|
? opts::IndirectCallPromotionTopN
|
|
|
|
: opts::IndirectCallPromotionTopN;
|
2021-05-14 01:50:47 +08:00
|
|
|
size_t I = 0;
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
for (; I < HotTargets.size(); ++I) {
|
2021-04-08 15:19:26 +08:00
|
|
|
const uint64_t MemAccesses = HotTargets[I].first;
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
if (100 * MemAccesses <
|
|
|
|
TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
|
|
|
|
break;
|
|
|
|
if (100 * MemAccesses <
|
|
|
|
RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
|
|
|
|
break;
|
|
|
|
if (TopN && I >= TopN)
|
|
|
|
break;
|
|
|
|
RemainingMemAccesses -= MemAccesses;
|
2017-11-30 09:38:39 +08:00
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
const uint64_t JTIndex = HotTargets[I].second;
|
|
|
|
Callsite &Target = Targets[findTargetsIndex(JTIndex)];
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
|
|
|
|
NewTargets.push_back(Target);
|
|
|
|
std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
|
|
|
|
Target.JTIndices.erase(std::remove(Target.JTIndices.begin(),
|
|
|
|
Target.JTIndices.end(), JTIndex),
|
|
|
|
Target.JTIndices.end());
|
|
|
|
|
|
|
|
// Keep fixCFG counts sane if more indices use this same target later
|
|
|
|
assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
|
|
|
|
NewTargets.back().Branches =
|
|
|
|
Target.Branches / IndicesPerTarget[Target.To.Sym];
|
|
|
|
NewTargets.back().Mispreds =
|
|
|
|
Target.Mispreds / IndicesPerTarget[Target.To.Sym];
|
|
|
|
assert(Target.Branches >= NewTargets.back().Branches);
|
|
|
|
assert(Target.Mispreds >= NewTargets.back().Mispreds);
|
|
|
|
Target.Branches -= NewTargets.back().Branches;
|
|
|
|
Target.Mispreds -= NewTargets.back().Mispreds;
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
2017-11-30 09:38:39 +08:00
|
|
|
std::copy(Targets.begin(), Targets.end(), std::back_inserter(NewTargets));
|
2017-10-21 03:11:34 +08:00
|
|
|
std::swap(NewTargets, Targets);
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
N = I;
|
|
|
|
|
|
|
|
if (N == 0 && opts::Verbosity >= 1) {
|
2021-10-26 15:06:34 +08:00
|
|
|
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
|
|
|
|
<< BB.getName()
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
<< ": failed to meet thresholds after memory profile data was "
|
|
|
|
"loaded.\n";
|
|
|
|
return SymTargets;
|
|
|
|
}
|
2017-11-30 09:38:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
|
2021-04-08 15:19:26 +08:00
|
|
|
Callsite &Target = Targets[TgtIdx];
|
2017-12-14 15:12:01 +08:00
|
|
|
assert(Target.To.Sym && "All ICP targets must be to known symbols");
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
assert(!Target.JTIndices.empty() && "Jump tables must have indices");
|
2021-04-08 15:19:26 +08:00
|
|
|
for (uint64_t Idx : Target.JTIndices) {
|
2021-05-08 09:43:25 +08:00
|
|
|
SymTargets.emplace_back(Target.To.Sym, Idx);
|
2017-11-30 09:38:39 +08:00
|
|
|
++I;
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
}
|
|
|
|
}
|
2017-10-21 03:11:34 +08:00
|
|
|
} else {
|
|
|
|
for (size_t I = 0; I < N; ++I) {
|
2021-12-15 08:52:51 +08:00
|
|
|
assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
assert(Targets[I].JTIndices.empty() &&
|
2017-10-21 03:11:34 +08:00
|
|
|
"Can't have jump table indices for non-jump tables");
|
2021-05-08 09:43:25 +08:00
|
|
|
SymTargets.emplace_back(Targets[I].To.Sym, 0);
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return SymTargets;
|
|
|
|
}
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
|
|
|
|
BinaryBasicBlock &BB, MCInst &Inst,
|
|
|
|
const SymTargetsType &SymTargets) const {
|
|
|
|
BinaryFunction &Function = *BB.getFunction();
|
|
|
|
BinaryContext &BC = Function.getBinaryContext();
|
2018-04-21 11:03:31 +08:00
|
|
|
std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
|
2017-10-21 03:11:34 +08:00
|
|
|
std::vector<MCInst *> MethodFetchInsns;
|
|
|
|
unsigned VtableReg, MethodReg;
|
|
|
|
uint64_t MethodOffset;
|
|
|
|
|
|
|
|
assert(!Function.getJumpTable(Inst) &&
|
|
|
|
"Can't get vtable addrs for jump tables.");
|
|
|
|
|
2020-05-08 14:00:29 +08:00
|
|
|
if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
|
2017-10-21 03:11:34 +08:00
|
|
|
return MethodInfoType();
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
|
2021-12-15 08:52:51 +08:00
|
|
|
if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
|
|
|
|
MethodFetchInsns, VtableReg, MethodReg,
|
2017-10-21 03:11:34 +08:00
|
|
|
MethodOffset)) {
|
2021-10-26 15:06:34 +08:00
|
|
|
DEBUG_VERBOSE(
|
|
|
|
1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
|
|
|
|
<< Function << " at @ " << (&Inst - &BB.front()) << "\n");
|
2017-10-21 03:11:34 +08:00
|
|
|
return MethodInfoType();
|
|
|
|
}
|
|
|
|
|
|
|
|
++TotalMethodLoadEliminationCandidates;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
DEBUG_VERBOSE(1, {
|
2021-10-26 15:06:34 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
|
|
|
|
<< " at @ " << (&Inst - &BB.front()) << "\n";
|
2017-10-21 03:11:34 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
|
2021-12-29 08:36:17 +08:00
|
|
|
for (MCInst *Inst : MethodFetchInsns)
|
2017-10-21 03:11:34 +08:00
|
|
|
BC.printInstruction(dbgs(), *Inst, 0, &Function);
|
2021-12-29 08:36:17 +08:00
|
|
|
|
|
|
|
if (MethodFetchInsns.back() != &Inst)
|
2017-10-21 03:11:34 +08:00
|
|
|
BC.printInstruction(dbgs(), Inst, 0, &Function);
|
2021-04-08 15:19:26 +08:00
|
|
|
});
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
// Try to get value profiling data for the method load instruction.
|
2020-05-08 14:00:29 +08:00
|
|
|
auto ErrorOrMemAccesssProfile =
|
2021-12-15 08:52:51 +08:00
|
|
|
BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
|
|
|
|
"MemoryAccessProfile");
|
2020-05-08 14:00:29 +08:00
|
|
|
if (!ErrorOrMemAccesssProfile) {
|
2021-12-15 08:52:51 +08:00
|
|
|
DEBUG_VERBOSE(1, dbgs()
|
|
|
|
<< "BOLT-INFO: ICP no memory profiling data found\n");
|
2017-10-21 03:11:34 +08:00
|
|
|
return MethodInfoType();
|
|
|
|
}
|
2021-04-08 15:19:26 +08:00
|
|
|
MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
// Find the vtable that each method belongs to.
|
|
|
|
std::map<const MCSymbol *, uint64_t> MethodToVtable;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
|
2020-05-08 14:00:29 +08:00
|
|
|
uint64_t Address = AccessInfo.Offset;
|
2021-12-29 08:36:17 +08:00
|
|
|
if (AccessInfo.MemoryObject)
|
2020-05-08 14:00:29 +08:00
|
|
|
Address += AccessInfo.MemoryObject->getAddress();
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
// Ignore bogus data.
|
|
|
|
if (!Address)
|
|
|
|
continue;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
const uint64_t VtableBase = Address - MethodOffset;
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
|
2021-12-15 08:52:51 +08:00
|
|
|
<< Twine::utohexstr(VtableBase) << "+"
|
|
|
|
<< MethodOffset << "/" << AccessInfo.Count << "\n");
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
|
|
|
|
BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
|
2021-12-15 08:52:51 +08:00
|
|
|
if (!MethodBD) // skip unknown methods
|
2017-11-15 12:05:11 +08:00
|
|
|
continue;
|
2021-04-08 15:19:26 +08:00
|
|
|
MCSymbol *MethodSym = MethodBD->getSymbol();
|
2017-10-21 03:11:34 +08:00
|
|
|
MethodToVtable[MethodSym] = VtableBase;
|
2021-04-08 15:19:26 +08:00
|
|
|
DEBUG_VERBOSE(1, {
|
|
|
|
const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
|
2017-10-21 03:11:34 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP found method = "
|
|
|
|
<< Twine::utohexstr(MethodAddr.get()) << "/"
|
|
|
|
<< (Method ? Method->getPrintName() : "") << "\n";
|
2021-04-08 15:19:26 +08:00
|
|
|
});
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find the vtable for each target symbol.
|
|
|
|
for (size_t I = 0; I < SymTargets.size(); ++I) {
|
|
|
|
auto Itr = MethodToVtable.find(SymTargets[I].first);
|
|
|
|
if (Itr != MethodToVtable.end()) {
|
2021-04-08 15:19:26 +08:00
|
|
|
if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
|
2018-04-21 11:03:31 +08:00
|
|
|
const uint64_t Addend = Itr->second - BD->getAddress();
|
2021-05-08 09:43:25 +08:00
|
|
|
VtableSyms.emplace_back(BD->getSymbol(), Addend);
|
2018-04-21 11:03:31 +08:00
|
|
|
continue;
|
|
|
|
}
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
2018-04-21 11:03:31 +08:00
|
|
|
// Give up if we can't find the vtable for a method.
|
|
|
|
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
|
|
|
|
<< SymTargets[I].first->getName() << "\n");
|
|
|
|
return MethodInfoType();
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure the vtable reg is not clobbered by the argument passing code
|
|
|
|
if (VtableReg != MethodReg) {
|
2021-04-08 15:19:26 +08:00
|
|
|
for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
|
|
|
|
++CurInst) {
|
|
|
|
const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
|
2021-12-29 08:36:17 +08:00
|
|
|
if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
|
2017-10-21 03:11:34 +08:00
|
|
|
return MethodInfoType();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-21 11:03:31 +08:00
|
|
|
return MethodInfoType(VtableSyms, MethodFetchInsns);
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
[BOLT rebase] Rebase fixes on top of LLVM Feb2018
Summary:
This commit includes all code necessary to make BOLT work again
after the rebase. This includes a redesign of the EHFrame work,
cherry-pick of the 3dnow disassembly work, compilation error fixes,
and port of the debug_info work. The macroop fusion feature is not
ported yet.
The rebased version has minor changes to the "executed instructions"
dynostats counter because REP prefixes are considered a part of the
instruction it applies to. Also, some X86 instructions had the "mayLoad"
tablegen property removed, which BOLT uses to identify and account
for loads, thus reducing the total number of loads reported by
dynostats. This was observed in X86::MOVDQUmr. TRAP instructions are
not terminators anymore, changing our CFG. This commit adds compensation
to preserve this old behavior and minimize test changes. debug_info
sections are now slightly larger. The discriminator field in the line
table is slightly different due to a change upstream. New profiles
generated with the other bolt are incompatible with this version
because of different hash values calculated for functions, so they will
be considered 100% stale. This commit changes the corresponding test
to XFAIL so it can be updated. The hash function changes because it
relies on raw opcode values, which change according to the opcodes
described in the X86 tablegen files. When processing HHVM, bolt was
observed to be using about 800MB more memory in the rebased version
and being about 5% slower.
(cherry picked from FBD7078072)
2018-02-07 07:00:23 +08:00
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
std::vector<std::unique_ptr<BinaryBasicBlock>>
|
2017-10-21 03:11:34 +08:00
|
|
|
IndirectCallPromotion::rewriteCall(
|
2021-10-26 15:06:34 +08:00
|
|
|
BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
|
|
|
|
MCPlusBuilder::BlocksVectorTy &&ICPcode,
|
|
|
|
const std::vector<MCInst *> &MethodFetchInsns) const {
|
|
|
|
BinaryFunction &Function = *IndCallBlock.getFunction();
|
|
|
|
MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
|
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
// Create new basic blocks with correct code in each one first.
|
|
|
|
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
|
2021-10-26 15:06:34 +08:00
|
|
|
const bool IsTailCallOrJT =
|
|
|
|
(MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
// Move instructions from the tail of the original call block
|
|
|
|
// to the merge block.
|
|
|
|
|
|
|
|
// Remember any pseudo instructions following a tail call. These
|
|
|
|
// must be preserved and moved to the original block.
|
2021-12-10 03:53:12 +08:00
|
|
|
InstructionListType TailInsts;
|
2021-04-08 15:19:26 +08:00
|
|
|
const MCInst *TailInst = &CallInst;
|
2021-12-29 08:36:17 +08:00
|
|
|
if (IsTailCallOrJT)
|
2021-10-26 15:06:34 +08:00
|
|
|
while (TailInst + 1 < &(*IndCallBlock.end()) &&
|
2021-12-29 08:36:17 +08:00
|
|
|
MIB->isPseudo(*(TailInst + 1)))
|
2017-03-09 11:58:33 +08:00
|
|
|
TailInsts.push_back(*++TailInst);
|
|
|
|
|
2021-12-10 03:53:12 +08:00
|
|
|
InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
|
2019-04-13 08:33:46 +08:00
|
|
|
// Link new BBs to the original input offset of the BB where the indirect
|
|
|
|
// call site is, so we can map samples recorded in new BBs back to the
|
|
|
|
// original BB seen in the input binary (if using BAT)
|
2021-10-26 15:06:34 +08:00
|
|
|
const uint32_t OrigOffset = IndCallBlock.getInputOffset();
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
|
|
|
|
MethodFetchInsns.end());
|
|
|
|
if (IndCallBlock.empty() ||
|
2021-12-29 08:36:17 +08:00
|
|
|
(!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
|
2021-10-26 15:06:34 +08:00
|
|
|
IndCallBlock.addInstructions(ICPcode.front().second.begin(),
|
|
|
|
ICPcode.front().second.end());
|
2021-12-29 08:36:17 +08:00
|
|
|
else
|
2021-10-26 15:06:34 +08:00
|
|
|
IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
|
|
|
|
ICPcode.front().second);
|
|
|
|
IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
|
2021-04-08 15:19:26 +08:00
|
|
|
MCSymbol *&Sym = Itr->first;
|
2021-12-10 03:53:12 +08:00
|
|
|
InstructionListType &Insts = Itr->second;
|
2017-03-09 11:58:33 +08:00
|
|
|
assert(Sym);
|
2021-04-08 15:19:26 +08:00
|
|
|
std::unique_ptr<BinaryBasicBlock> TBB =
|
|
|
|
Function.createBasicBlock(OrigOffset, Sym);
|
2021-12-29 08:36:17 +08:00
|
|
|
for (MCInst &Inst : Insts) // sanitize new instructions.
|
2021-10-26 15:06:34 +08:00
|
|
|
if (MIB->isCall(Inst))
|
|
|
|
MIB->removeAnnotation(Inst, "CallProfile");
|
2017-03-09 11:58:33 +08:00
|
|
|
TBB->addInstructions(Insts.begin(), Insts.end());
|
|
|
|
NewBBs.emplace_back(std::move(TBB));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Move tail of instructions from after the original call to
|
|
|
|
// the merge block.
|
2021-12-29 08:36:17 +08:00
|
|
|
if (!IsTailCallOrJT)
|
2017-03-09 11:58:33 +08:00
|
|
|
NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());
|
|
|
|
|
|
|
|
return NewBBs;
|
|
|
|
}
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
BinaryBasicBlock *
|
|
|
|
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
|
|
|
|
const bool IsTailCall, const bool IsJumpTable,
|
|
|
|
IndirectCallPromotion::BasicBlocksVector &&NewBBs,
|
|
|
|
const std::vector<Callsite> &Targets) const {
|
|
|
|
BinaryFunction &Function = *IndCallBlock.getFunction();
|
2017-03-09 11:58:33 +08:00
|
|
|
using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
|
|
|
|
BinaryBasicBlock *MergeBlock = nullptr;
|
|
|
|
|
|
|
|
// Scale indirect call counts to the execution count of the original
|
|
|
|
// basic block containing the indirect call.
|
2021-10-26 15:06:34 +08:00
|
|
|
uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
|
2017-03-09 11:58:33 +08:00
|
|
|
uint64_t TotalIndirectBranches = 0;
|
2021-12-29 08:36:17 +08:00
|
|
|
for (const Callsite &Target : Targets)
|
2018-08-24 13:47:46 +08:00
|
|
|
TotalIndirectBranches += Target.Branches;
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seem in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
if (TotalIndirectBranches == 0)
|
|
|
|
TotalIndirectBranches = 1;
|
2021-12-10 03:53:12 +08:00
|
|
|
BinaryBasicBlock::BranchInfoType BBI;
|
|
|
|
BinaryBasicBlock::BranchInfoType ScaledBBI;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (const Callsite &Target : Targets) {
|
2021-11-12 10:14:53 +08:00
|
|
|
const size_t NumEntries =
|
|
|
|
std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
|
2018-08-24 13:47:46 +08:00
|
|
|
for (size_t I = 0; I < NumEntries; ++I) {
|
2018-08-31 04:21:50 +08:00
|
|
|
BBI.push_back(
|
|
|
|
BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
|
2021-12-15 08:52:51 +08:00
|
|
|
(Target.Mispreds + NumEntries - 1) / NumEntries});
|
|
|
|
ScaledBBI.push_back(
|
|
|
|
BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
|
|
|
|
(NumEntries * TotalIndirectBranches)),
|
|
|
|
uint64_t(TotalCount * Target.Mispreds /
|
|
|
|
(NumEntries * TotalIndirectBranches))});
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
}
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
if (IsJumpTable) {
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
|
2021-10-26 15:06:34 +08:00
|
|
|
IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
std::vector<MCSymbol *> SymTargets;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (const Callsite &Target : Targets) {
|
2021-11-12 10:14:53 +08:00
|
|
|
const size_t NumEntries =
|
|
|
|
std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
|
2021-12-29 08:36:17 +08:00
|
|
|
for (size_t I = 0; I < NumEntries; ++I)
|
2018-08-24 13:47:46 +08:00
|
|
|
SymTargets.push_back(Target.To.Sym);
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
}
|
2017-11-30 09:38:39 +08:00
|
|
|
assert(SymTargets.size() > NewBBs.size() - 1 &&
|
|
|
|
"There must be a target symbol associated with each new BB.");
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
|
2018-08-24 13:47:46 +08:00
|
|
|
for (uint64_t I = 0; I < NewBBs.size(); ++I) {
|
2021-10-26 15:06:34 +08:00
|
|
|
BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
|
2018-08-24 13:47:46 +08:00
|
|
|
SourceBB->setExecutionCount(TotalCount);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryBasicBlock *TargetBB =
|
|
|
|
Function.getBasicBlockForLabel(SymTargets[I]);
|
2018-08-24 13:47:46 +08:00
|
|
|
SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken
|
|
|
|
|
|
|
|
TotalCount -= ScaledBBI[I].Count;
|
|
|
|
SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through
|
|
|
|
|
|
|
|
// Update branch info for the indirect jump.
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
|
|
|
|
NewIndCallBlock->getBranchInfo(*TargetBB);
|
2018-08-31 04:21:50 +08:00
|
|
|
if (BranchInfo.Count > BBI[I].Count)
|
|
|
|
BranchInfo.Count -= BBI[I].Count;
|
|
|
|
else
|
|
|
|
BranchInfo.Count = 0;
|
|
|
|
|
|
|
|
if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
|
|
|
|
BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
|
|
|
|
else
|
|
|
|
BranchInfo.MispredictedCount = 0;
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(NewBBs.size() >= 2);
|
2021-10-26 15:06:34 +08:00
|
|
|
assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
|
2017-10-21 03:11:34 +08:00
|
|
|
assert(NewBBs.size() % 2 == 1 || IsTailCall);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2018-08-24 13:47:46 +08:00
|
|
|
auto ScaledBI = ScaledBBI.begin();
|
2021-12-15 08:52:51 +08:00
|
|
|
auto updateCurrentBranchInfo = [&] {
|
2018-08-24 13:47:46 +08:00
|
|
|
assert(ScaledBI != ScaledBBI.end());
|
|
|
|
TotalCount -= ScaledBI->Count;
|
|
|
|
++ScaledBI;
|
|
|
|
};
|
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
if (!IsTailCall) {
|
|
|
|
MergeBlock = NewBBs.back().get();
|
2021-10-26 15:06:34 +08:00
|
|
|
IndCallBlock.moveAllSuccessorsTo(MergeBlock);
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compare it to check if it is a hot target. This
accomplishes branch misprediction reduction by promote the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
// Fix up successors and execution counts.
|
|
|
|
updateCurrentBranchInfo();
|
2021-10-26 15:06:34 +08:00
|
|
|
IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
|
|
|
|
IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
const size_t Adj = IsTailCall ? 1 : 2;
|
|
|
|
for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
|
2021-10-26 15:06:34 +08:00
|
|
|
assert(TotalCount <= IndCallBlock.getExecutionCount() ||
|
2017-03-09 11:58:33 +08:00
|
|
|
TotalCount <= uint64_t(TotalIndirectBranches));
|
2021-04-08 15:19:26 +08:00
|
|
|
uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
|
2017-03-09 11:58:33 +08:00
|
|
|
if (I % 2 == 0) {
|
2021-12-29 08:36:17 +08:00
|
|
|
if (MergeBlock)
|
2021-12-15 08:52:51 +08:00
|
|
|
NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
|
2017-03-09 11:58:33 +08:00
|
|
|
} else {
|
|
|
|
assert(I + 2 < NewBBs.size());
|
|
|
|
updateCurrentBranchInfo();
|
2021-12-15 08:52:51 +08:00
|
|
|
NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
|
|
|
|
NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
|
2017-03-09 11:58:33 +08:00
|
|
|
ExecCount += TotalCount;
|
|
|
|
}
|
|
|
|
NewBBs[I]->setExecutionCount(ExecCount);
|
|
|
|
}
|
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
if (MergeBlock) {
|
|
|
|
// Arrange for the MergeBlock to be the fallthrough for the first
|
|
|
|
// promoted call block.
|
|
|
|
std::unique_ptr<BinaryBasicBlock> MBPtr;
|
|
|
|
std::swap(MBPtr, NewBBs.back());
|
|
|
|
NewBBs.pop_back();
|
|
|
|
NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
|
|
|
|
// TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
|
|
|
|
NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
|
|
|
|
}
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
|
2018-08-24 13:47:46 +08:00
|
|
|
// Update the execution count.
|
2017-03-09 11:58:33 +08:00
|
|
|
NewBBs.back()->setExecutionCount(TotalCount);
|
|
|
|
|
2018-08-24 13:47:46 +08:00
|
|
|
// Update BB and BB layout.
|
2021-10-26 15:06:34 +08:00
|
|
|
Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
|
2017-03-09 11:58:33 +08:00
|
|
|
assert(Function.validateCFG());
|
|
|
|
|
|
|
|
return MergeBlock;
|
|
|
|
}
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
size_t IndirectCallPromotion::canPromoteCallsite(
|
|
|
|
const BinaryBasicBlock &BB, const MCInst &Inst,
|
|
|
|
const std::vector<Callsite> &Targets, uint64_t NumCalls) {
|
|
|
|
if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
|
2020-07-28 09:07:18 +08:00
|
|
|
return 0;
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2017-11-30 09:38:39 +08:00
|
|
|
auto computeStats = [&](size_t N) {
|
2021-12-29 08:36:17 +08:00
|
|
|
for (size_t I = 0; I < N; ++I)
|
2017-11-30 09:38:39 +08:00
|
|
|
if (!IsJumpTable)
|
|
|
|
TotalNumFrequentCalls += Targets[I].Branches;
|
|
|
|
else
|
|
|
|
TotalNumFrequentJmps += Targets[I].Branches;
|
|
|
|
};
|
|
|
|
|
|
|
|
// If we have no targets (or no calls), skip this callsite.
|
2017-03-09 11:58:33 +08:00
|
|
|
if (Targets.empty() || !NumCalls) {
|
|
|
|
if (opts::Verbosity >= 1) {
|
2021-04-09 14:31:12 +08:00
|
|
|
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
|
2021-10-26 15:06:34 +08:00
|
|
|
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " @ "
|
|
|
|
<< InstIdx << " in " << BB.getName() << ", calls = " << NumCalls
|
2017-03-09 11:58:33 +08:00
|
|
|
<< ", targets empty or NumCalls == 0.\n";
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
size_t TopN = opts::IndirectCallPromotionTopN;
|
|
|
|
if (IsJumpTable) {
|
|
|
|
if (opts::IndirectCallPromotionJumpTablesTopN != 0)
|
|
|
|
TopN = opts::IndirectCallPromotionJumpTablesTopN;
|
|
|
|
} else if (opts::IndirectCallPromotionCallsTopN != 0) {
|
|
|
|
TopN = opts::IndirectCallPromotionCallsTopN;
|
|
|
|
}
|
2021-04-08 15:19:26 +08:00
|
|
|
const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
if (opts::ICPTopCallsites > 0) {
|
2021-10-26 15:06:34 +08:00
|
|
|
BinaryContext &BC = BB.getFunction()->getBinaryContext();
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
if (!BC.MIB->hasAnnotation(Inst, "DoICP"))
|
|
|
|
return 0;
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Pick the top N targets.
|
|
|
|
uint64_t TotalMispredictsTopN = 0;
|
|
|
|
size_t N = 0;
|
|
|
|
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
if (opts::IndirectCallPromotionUseMispredicts &&
|
|
|
|
(!IsJumpTable || opts::ICPJumpTablesByTarget)) {
|
2017-03-09 11:58:33 +08:00
|
|
|
// Count total number of mispredictions for (at most) the top N targets.
|
|
|
|
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
|
|
|
|
// is exceeded by fewer targets.
|
|
|
|
double Threshold = double(opts::IndirectCallPromotionMispredictThreshold);
|
|
|
|
for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
|
2017-11-30 09:38:39 +08:00
|
|
|
Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
|
2017-03-09 11:58:33 +08:00
|
|
|
TotalMispredictsTopN += Targets[I].Mispreds;
|
|
|
|
}
|
2017-11-30 09:38:39 +08:00
|
|
|
computeStats(N);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
// Compute the misprediction frequency of the top N call targets. If this
|
2021-10-26 15:06:34 +08:00
|
|
|
// frequency is greater than the threshold, we should try ICP on this
|
|
|
|
// callsite.
|
2017-03-09 11:58:33 +08:00
|
|
|
const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
|
|
|
|
if (TopNFrequency == 0 ||
|
|
|
|
TopNFrequency < opts::IndirectCallPromotionMispredictThreshold) {
|
|
|
|
if (opts::Verbosity >= 1) {
|
2021-04-09 14:31:12 +08:00
|
|
|
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
|
2021-10-26 15:06:34 +08:00
|
|
|
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " @ "
|
|
|
|
<< InstIdx << " in " << BB.getName() << ", calls = " << NumCalls
|
|
|
|
<< ", top N mis. frequency " << format("%.1f", TopNFrequency)
|
|
|
|
<< "% < " << opts::IndirectCallPromotionMispredictThreshold
|
|
|
|
<< "%\n";
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
} else {
|
2017-11-30 09:38:39 +08:00
|
|
|
size_t MaxTargets = 0;
|
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
// Count total number of calls for (at most) the top N targets.
|
|
|
|
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
|
|
|
|
// is exceeded by fewer targets.
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
const unsigned TotalThreshold = IsJumpTable
|
|
|
|
? opts::ICPJTTotalPercentThreshold
|
|
|
|
: opts::ICPCallsTotalPercentThreshold;
|
|
|
|
const unsigned RemainingThreshold =
|
|
|
|
IsJumpTable ? opts::ICPJTRemainingPercentThreshold
|
|
|
|
: opts::ICPCallsRemainingPercentThreshold;
|
|
|
|
uint64_t NumRemainingCalls = NumCalls;
|
|
|
|
for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
|
|
|
|
if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
|
|
|
|
break;
|
|
|
|
if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
|
|
|
|
break;
|
|
|
|
if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
TrialN)
|
|
|
|
break;
|
2017-03-09 11:58:33 +08:00
|
|
|
TotalMispredictsTopN += Targets[I].Mispreds;
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
NumRemainingCalls -= Targets[I].Branches;
|
|
|
|
N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
2017-11-30 09:38:39 +08:00
|
|
|
computeStats(MaxTargets);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
// Don't check misprediction frequency for jump tables -- we don't really
|
|
|
|
// care as long as we are saving loads from the jump table.
|
2017-10-21 03:11:34 +08:00
|
|
|
if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
|
|
|
|
// Compute the misprediction frequency of the top N call targets. If
|
|
|
|
// this frequency is less than the threshold, we should skip ICP at
|
|
|
|
// this callsite.
|
|
|
|
const double TopNMispredictFrequency =
|
2021-12-15 08:52:51 +08:00
|
|
|
(100.0 * TotalMispredictsTopN) / NumCalls;
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
if (TopNMispredictFrequency <
|
|
|
|
opts::IndirectCallPromotionMispredictThreshold) {
|
|
|
|
if (opts::Verbosity >= 1) {
|
2021-04-09 14:31:12 +08:00
|
|
|
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
|
2021-10-26 15:06:34 +08:00
|
|
|
outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " @ "
|
|
|
|
<< InstIdx << " in " << BB.getName()
|
|
|
|
<< ", calls = " << NumCalls << ", top N mispredict frequency "
|
2017-10-21 03:11:34 +08:00
|
|
|
<< format("%.1f", TopNMispredictFrequency) << "% < "
|
|
|
|
<< opts::IndirectCallPromotionMispredictThreshold << "%\n";
|
|
|
|
}
|
|
|
|
return 0;
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
// Filter functions that can have ICP applied (for debugging)
|
|
|
|
if (!opts::ICPFuncsList.empty()) {
|
2021-12-29 08:36:17 +08:00
|
|
|
for (std::string &Name : opts::ICPFuncsList)
|
2021-10-26 15:06:34 +08:00
|
|
|
if (BB.getFunction()->hasName(Name))
|
2017-10-21 03:11:34 +08:00
|
|
|
return N;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
return N;
|
|
|
|
}
|
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
void IndirectCallPromotion::printCallsiteInfo(
|
|
|
|
const BinaryBasicBlock &BB, const MCInst &Inst,
|
|
|
|
const std::vector<Callsite> &Targets, const size_t N,
|
|
|
|
uint64_t NumCalls) const {
|
|
|
|
BinaryContext &BC = BB.getFunction()->getBinaryContext();
|
2018-03-10 01:45:13 +08:00
|
|
|
const bool IsTailCall = BC.MIB->isTailCall(Inst);
|
2021-10-26 15:06:34 +08:00
|
|
|
const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
|
2021-04-09 14:31:12 +08:00
|
|
|
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
|
|
|
|
<< " @ " << InstIdx << " in " << BB.getName()
|
2017-03-09 11:58:33 +08:00
|
|
|
<< " -> calls = " << NumCalls
|
2017-11-30 09:38:39 +08:00
|
|
|
<< (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
|
|
|
|
<< "\n";
|
2017-03-09 11:58:33 +08:00
|
|
|
for (size_t I = 0; I < N; I++) {
|
2021-04-08 15:19:26 +08:00
|
|
|
const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
|
|
|
|
const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
|
2017-11-30 09:38:39 +08:00
|
|
|
outs() << "BOLT-INFO: ";
|
2017-12-14 15:12:01 +08:00
|
|
|
if (Targets[I].To.Sym)
|
2017-03-09 11:58:33 +08:00
|
|
|
outs() << Targets[I].To.Sym->getName();
|
|
|
|
else
|
|
|
|
outs() << Targets[I].To.Addr;
|
|
|
|
outs() << ", calls = " << Targets[I].Branches
|
|
|
|
<< ", mispreds = " << Targets[I].Mispreds
|
|
|
|
<< ", taken freq = " << format("%.1f", Frequency) << "%"
|
|
|
|
<< ", mis. freq = " << format("%.1f", MisFrequency) << "%";
|
2017-10-21 03:11:34 +08:00
|
|
|
bool First = true;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (uint64_t JTIndex : Targets[I].JTIndices) {
|
2017-10-21 03:11:34 +08:00
|
|
|
outs() << (First ? ", indices = " : ", ") << JTIndex;
|
|
|
|
First = false;
|
|
|
|
}
|
2017-11-30 09:38:39 +08:00
|
|
|
outs() << "\n";
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG({
|
2017-11-30 09:38:39 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP original call instruction:";
|
2017-03-09 11:58:33 +08:00
|
|
|
BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2019-04-04 06:52:01 +08:00
|
|
|
void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
|
2017-03-09 11:58:33 +08:00
|
|
|
if (opts::IndirectCallPromotion == ICP_NONE)
|
|
|
|
return;
|
|
|
|
|
2019-04-04 06:52:01 +08:00
|
|
|
auto &BFs = BC.getBinaryFunctions();
|
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
const bool OptimizeCalls = (opts::IndirectCallPromotion == ICP_CALLS ||
|
|
|
|
opts::IndirectCallPromotion == ICP_ALL);
|
2017-10-21 03:11:34 +08:00
|
|
|
const bool OptimizeJumpTables =
|
2021-12-15 08:52:51 +08:00
|
|
|
(opts::IndirectCallPromotion == ICP_JUMP_TABLES ||
|
|
|
|
opts::IndirectCallPromotion == ICP_ALL);
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2017-06-03 07:57:22 +08:00
|
|
|
std::unique_ptr<RegAnalysis> RA;
|
|
|
|
std::unique_ptr<BinaryFunctionCallGraph> CG;
|
2018-08-31 04:21:50 +08:00
|
|
|
if (OptimizeJumpTables) {
|
2019-04-04 06:52:01 +08:00
|
|
|
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
|
2018-06-12 04:18:44 +08:00
|
|
|
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
|
2017-06-03 07:57:22 +08:00
|
|
|
}
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
// If icp-top-callsites is enabled, compute the total number of indirect
|
|
|
|
// calls and then optimize the hottest callsites that contribute to that
|
|
|
|
// total.
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
SetVector<BinaryFunction *> Functions;
|
|
|
|
if (opts::ICPTopCallsites == 0) {
|
2021-12-29 08:36:17 +08:00
|
|
|
for (auto &KV : BFs)
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
Functions.insert(&KV.second);
|
|
|
|
} else {
|
|
|
|
using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
|
2017-10-21 03:11:34 +08:00
|
|
|
std::vector<IndirectCallsite> IndirectCalls;
|
|
|
|
size_t TotalIndirectCalls = 0;
|
|
|
|
|
|
|
|
// Find all the indirect callsites.
|
|
|
|
for (auto &BFIt : BFs) {
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryFunction &Function = BFIt.second;
|
2017-10-21 03:11:34 +08:00
|
|
|
|
2020-05-04 04:54:45 +08:00
|
|
|
if (!Function.isSimple() || Function.isIgnored() ||
|
2017-12-14 15:12:01 +08:00
|
|
|
!Function.hasProfile())
|
2017-10-21 03:11:34 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
const bool HasLayout = !Function.layout_empty();
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock &BB : Function) {
|
2017-10-21 03:11:34 +08:00
|
|
|
if (HasLayout && Function.isSplit() && BB.isCold())
|
|
|
|
continue;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
for (MCInst &Inst : BB) {
|
2017-11-20 03:17:57 +08:00
|
|
|
const bool IsJumpTable = Function.getJumpTable(Inst);
|
2017-12-14 15:12:01 +08:00
|
|
|
const bool HasIndirectCallProfile =
|
2021-12-15 08:52:51 +08:00
|
|
|
BC.MIB->hasAnnotation(Inst, "CallProfile");
|
|
|
|
const bool IsDirectCall =
|
|
|
|
(BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0));
|
2017-11-20 03:17:57 +08:00
|
|
|
|
|
|
|
if (!IsDirectCall &&
|
2017-12-14 15:12:01 +08:00
|
|
|
((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
|
2017-11-20 03:17:57 +08:00
|
|
|
(IsJumpTable && OptimizeJumpTables))) {
|
|
|
|
uint64_t NumCalls = 0;
|
2021-12-29 08:36:17 +08:00
|
|
|
for (const Callsite &BInfo : getCallTargets(BB, Inst))
|
2017-11-20 03:17:57 +08:00
|
|
|
NumCalls += BInfo.Branches;
|
2020-05-15 08:34:20 +08:00
|
|
|
IndirectCalls.push_back(
|
|
|
|
std::make_tuple(NumCalls, &Inst, &Function));
|
2017-11-20 03:17:57 +08:00
|
|
|
TotalIndirectCalls += NumCalls;
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort callsites by execution count.
|
2017-11-20 03:17:57 +08:00
|
|
|
std::sort(IndirectCalls.rbegin(), IndirectCalls.rend());
|
2017-10-21 03:11:34 +08:00
|
|
|
|
|
|
|
// Find callsites that contribute to the top "opts::ICPTopCallsites"%
|
|
|
|
// number of calls.
|
|
|
|
const float TopPerc = opts::ICPTopCallsites / 100.0f;
|
|
|
|
int64_t MaxCalls = TotalIndirectCalls * TopPerc;
|
2019-12-14 08:46:00 +08:00
|
|
|
uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
|
2017-10-21 03:11:34 +08:00
|
|
|
size_t Num = 0;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (const IndirectCallsite &IC : IndirectCalls) {
|
2019-12-14 08:46:00 +08:00
|
|
|
const uint64_t CurFreq = std::get<0>(IC);
|
|
|
|
// Once we decide to stop, include at least all branches that share the
|
|
|
|
// same frequency of the last one to avoid non-deterministic behavior
|
|
|
|
// (e.g. turning on/off ICP depending on the order of functions)
|
|
|
|
if (MaxCalls <= 0 && CurFreq != LastFreq)
|
2017-10-21 03:11:34 +08:00
|
|
|
break;
|
2019-12-14 08:46:00 +08:00
|
|
|
MaxCalls -= CurFreq;
|
|
|
|
LastFreq = CurFreq;
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being a
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
|
|
|
|
Functions.insert(std::get<2>(IC));
|
2017-10-21 03:11:34 +08:00
|
|
|
++Num;
|
|
|
|
}
|
|
|
|
outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
|
2017-12-14 15:12:01 +08:00
|
|
|
<< ", " << Num << " callsites cover " << opts::ICPTopCallsites
|
|
|
|
<< "% of all indirect calls\n";
|
2017-10-21 03:11:34 +08:00
|
|
|
}
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryFunction *FuncPtr : Functions) {
|
|
|
|
BinaryFunction &Function = *FuncPtr;
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2020-05-04 04:54:45 +08:00
|
|
|
if (!Function.isSimple() || Function.isIgnored() || !Function.hasProfile())
|
2017-03-09 11:58:33 +08:00
|
|
|
continue;
|
2017-07-18 02:22:22 +08:00
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
const bool HasLayout = !Function.layout_empty();
|
|
|
|
|
|
|
|
// Total number of indirect calls issued from the current Function.
|
|
|
|
// (a fraction of TotalIndirectCalls)
|
|
|
|
uint64_t FuncTotalIndirectCalls = 0;
|
|
|
|
uint64_t FuncTotalIndirectJmps = 0;
|
|
|
|
|
|
|
|
std::vector<BinaryBasicBlock *> BBs;
|
2021-04-08 15:19:26 +08:00
|
|
|
for (BinaryBasicBlock &BB : Function) {
|
2017-03-09 11:58:33 +08:00
|
|
|
// Skip indirect calls in cold blocks.
|
2021-12-29 08:36:17 +08:00
|
|
|
if (!HasLayout || !Function.isSplit() || !BB.isCold())
|
2017-03-09 11:58:33 +08:00
|
|
|
BBs.push_back(&BB);
|
|
|
|
}
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
if (BBs.empty())
|
|
|
|
continue;
|
2017-03-09 11:58:33 +08:00
|
|
|
|
2021-10-26 15:06:34 +08:00
|
|
|
DataflowInfoManager Info(Function, RA.get(), nullptr);
|
2017-03-09 11:58:33 +08:00
|
|
|
while (!BBs.empty()) {
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryBasicBlock *BB = BBs.back();
|
2017-03-09 11:58:33 +08:00
|
|
|
BBs.pop_back();
|
|
|
|
|
|
|
|
for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
|
2021-04-08 15:19:26 +08:00
|
|
|
MCInst &Inst = BB->getInstructionAtIndex(Idx);
|
2021-04-09 14:31:12 +08:00
|
|
|
const ptrdiff_t InstIdx = &Inst - &(*BB->begin());
|
2018-03-10 01:45:13 +08:00
|
|
|
const bool IsTailCall = BC.MIB->isTailCall(Inst);
|
2017-12-14 15:12:01 +08:00
|
|
|
const bool HasIndirectCallProfile =
|
2021-12-15 08:52:51 +08:00
|
|
|
BC.MIB->hasAnnotation(Inst, "CallProfile");
|
2017-03-09 11:58:33 +08:00
|
|
|
const bool IsJumpTable = Function.getJumpTable(Inst);
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
|
2021-12-29 08:36:17 +08:00
|
|
|
if (BC.MIB->isCall(Inst))
|
2017-11-30 09:38:39 +08:00
|
|
|
TotalCalls += BB->getKnownExecutionCount();
|
|
|
|
|
2018-08-31 04:21:50 +08:00
|
|
|
if (IsJumpTable && !OptimizeJumpTables)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
|
2017-03-09 11:58:33 +08:00
|
|
|
continue;
|
|
|
|
|
2017-10-13 05:57:11 +08:00
|
|
|
// Ignore direct calls.
|
2018-03-10 01:45:13 +08:00
|
|
|
if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0))
|
2017-10-13 05:57:11 +08:00
|
|
|
continue;
|
|
|
|
|
2021-12-15 08:52:51 +08:00
|
|
|
assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) &&
|
|
|
|
"expected a call or an indirect jump instruction");
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
if (IsJumpTable)
|
|
|
|
++TotalJumpTableCallsites;
|
|
|
|
else
|
|
|
|
++TotalIndirectCallsites;
|
|
|
|
|
2021-04-08 15:19:26 +08:00
|
|
|
std::vector<Callsite> Targets = getCallTargets(*BB, Inst);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
// Compute the total number of calls from this particular callsite.
|
|
|
|
uint64_t NumCalls = 0;
|
2021-12-29 08:36:17 +08:00
|
|
|
for (const Callsite &BInfo : Targets)
|
2017-03-09 11:58:33 +08:00
|
|
|
NumCalls += BInfo.Branches;
|
|
|
|
if (!IsJumpTable)
|
|
|
|
FuncTotalIndirectCalls += NumCalls;
|
|
|
|
else
|
|
|
|
FuncTotalIndirectJmps += NumCalls;
|
|
|
|
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
// If FLAGS regs is alive after this jmp site, do not try
|
|
|
|
// promoting because we will clobber FLAGS.
|
2017-10-21 03:11:34 +08:00
|
|
|
if (IsJumpTable) {
|
2021-04-08 15:19:26 +08:00
|
|
|
ErrorOr<const BitVector &> State =
|
|
|
|
Info.getLivenessAnalysis().getStateBefore(Inst);
|
2018-03-10 01:45:13 +08:00
|
|
|
if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
|
2021-12-29 08:36:17 +08:00
|
|
|
if (opts::Verbosity >= 1)
|
2017-10-21 03:11:34 +08:00
|
|
|
outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
|
|
|
|
<< InstIdx << " in " << BB->getName()
|
|
|
|
<< ", calls = " << NumCalls
|
|
|
|
<< (State ? ", cannot clobber flags reg.\n"
|
|
|
|
: ", no liveness data available.\n");
|
|
|
|
continue;
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
// Should this callsite be optimized? Return the number of targets
|
|
|
|
// to use when promoting this call. A value of zero means to skip
|
|
|
|
// this callsite.
|
2021-10-26 15:06:34 +08:00
|
|
|
size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being an
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
// If it is a jump table and it failed to meet our initial threshold,
|
|
|
|
// proceed to findCallTargetSymbols -- it may reevaluate N if
|
|
|
|
// memory profile is present
|
|
|
|
if (!N && !IsJumpTable)
|
2017-03-09 11:58:33 +08:00
|
|
|
continue;
|
|
|
|
|
2021-12-29 08:36:17 +08:00
|
|
|
if (opts::Verbosity >= 1)
|
2021-10-26 15:06:34 +08:00
|
|
|
printCallsiteInfo(*BB, Inst, Targets, N, NumCalls);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
// Find MCSymbols or absolute addresses for each call target.
|
2017-10-21 03:11:34 +08:00
|
|
|
MCInst *TargetFetchInst = nullptr;
|
2021-10-26 15:06:34 +08:00
|
|
|
const SymTargetsType SymTargets =
|
|
|
|
findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being an
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
// findCallTargetSymbols may have changed N if mem profile is available
|
|
|
|
// for jump tables
|
|
|
|
if (!N)
|
|
|
|
continue;
|
|
|
|
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG(printDecision(dbgs(), Targets, N));
|
[BOLT] Improve ICP activation policy and hot jt processing
Summary:
Previously, ICP worked with a budget of N targets to convert to
direct calls. As long as the frequency of up to N of the hottest targets
surpassed a given fraction (threshold) of the total frequency, say, 90%,
then the optimization would convert a number of targets (up to N) to
direct calls. Otherwise, it would completely abort processing this call
site. The intent was to convert a given fraction of the indirect call
site frequency to use direct calls instead, but this ends up being an
"all or nothing" strategy.
In this patch we change this to operate with the same strategy seen in
LLVM's ICP, with two thresholds. The idea is that the hottest target of
an indirect call site will be compared against these two thresholds: one
checks its frequency relative to the total frequency of the original
indirect call site, and the other checks its frequency relative to the
remaining, unconverted targets (excluding the hottest targets that were
already converted to direct calls). The remaining threshold is typically
set higher than the total threshold. This allows us more control over
ICP.
I expose two pairs of knobs, one for jump tables and another for
indirect calls.
To improve the promotion of hot jump table indices when we have memory
profile, I also fix a bug that could cause us to promote extra indices
besides the hottest ones as seen in the memory profile. When we have the
memory profile, I reapply the dual threshold checks to the memory
profile which specifies exactly which indices are hot. I then update N,
the number of targets to be promoted, based on this new information, and
update frequency information.
To allow us to work with smaller profiles, I also created an option in
perf2bolt to filter out memory samples outside the statically allocated
area of the binary (heap/stack). This option is on by default.
(cherry picked from FBD15187832)
2019-05-03 03:28:34 +08:00
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
// If we can't resolve any of the target symbols, punt on this callsite.
|
2017-10-21 03:11:34 +08:00
|
|
|
// TODO: can this ever happen?
|
2017-03-09 11:58:33 +08:00
|
|
|
if (SymTargets.size() < N) {
|
2021-04-08 15:19:26 +08:00
|
|
|
const size_t LastTarget = SymTargets.size();
|
2021-12-29 08:36:17 +08:00
|
|
|
if (opts::Verbosity >= 1)
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
|
|
|
|
<< InstIdx << " in " << BB->getName()
|
|
|
|
<< ", calls = " << NumCalls
|
|
|
|
<< ", ICP failed to find target symbol for "
|
|
|
|
<< Targets[LastTarget].To.Sym->getName() << "\n";
|
2017-03-09 11:58:33 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2017-10-21 03:11:34 +08:00
|
|
|
MethodInfoType MethodInfo;
|
|
|
|
|
|
|
|
if (!IsJumpTable) {
|
2021-10-26 15:06:34 +08:00
|
|
|
MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets);
|
2017-10-21 03:11:34 +08:00
|
|
|
TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG(dbgs()
|
|
|
|
<< "BOLT-INFO: ICP "
|
|
|
|
<< (!MethodInfo.first.empty() ? "found" : "did not find")
|
|
|
|
<< " vtables for all methods.\n");
|
2017-10-21 03:11:34 +08:00
|
|
|
} else if (TargetFetchInst) {
|
|
|
|
++TotalIndexBasedJumps;
|
|
|
|
MethodInfo.second.push_back(TargetFetchInst);
|
|
|
|
}
|
|
|
|
|
2017-03-09 11:58:33 +08:00
|
|
|
// Generate new promoted call code for this callsite.
|
2021-04-08 15:19:26 +08:00
|
|
|
MCPlusBuilder::BlocksVectorTy ICPcode =
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
(IsJumpTable && !opts::ICPJumpTablesByTarget)
|
2021-04-08 15:19:26 +08:00
|
|
|
? BC.MIB->jumpTablePromotion(Inst, SymTargets,
|
|
|
|
MethodInfo.second, BC.Ctx.get())
|
2018-03-10 01:45:13 +08:00
|
|
|
: BC.MIB->indirectCallPromotion(
|
2017-10-21 03:11:34 +08:00
|
|
|
Inst, SymTargets, MethodInfo.first, MethodInfo.second,
|
|
|
|
opts::ICPOldCodeSequence, BC.Ctx.get());
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
if (ICPcode.empty()) {
|
2021-12-29 08:36:17 +08:00
|
|
|
if (opts::Verbosity >= 1)
|
2017-03-09 11:58:33 +08:00
|
|
|
outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
|
|
|
|
<< InstIdx << " in " << BB->getName()
|
|
|
|
<< ", calls = " << NumCalls
|
|
|
|
<< ", unable to generate promoted call code.\n";
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-12-02 08:29:39 +08:00
|
|
|
LLVM_DEBUG({
|
2021-04-08 15:19:26 +08:00
|
|
|
uint64_t Offset = Targets[0].From.Addr;
|
2017-03-09 11:58:33 +08:00
|
|
|
dbgs() << "BOLT-INFO: ICP indirect call code:\n";
|
|
|
|
for (const auto &entry : ICPcode) {
|
2021-04-08 15:19:26 +08:00
|
|
|
const MCSymbol *const &Sym = entry.first;
|
2021-12-10 03:53:12 +08:00
|
|
|
const InstructionListType &Insts = entry.second;
|
2021-12-15 08:52:51 +08:00
|
|
|
if (Sym)
|
|
|
|
dbgs() << Sym->getName() << ":\n";
|
|
|
|
Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(),
|
2017-03-09 11:58:33 +08:00
|
|
|
Offset);
|
|
|
|
}
|
|
|
|
dbgs() << "---------------------------------------------------\n";
|
|
|
|
});
|
|
|
|
|
|
|
|
// Rewrite the CFG with the newly generated ICP code.
|
2021-10-26 15:06:34 +08:00
|
|
|
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
|
|
|
|
rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
// Fix the CFG after inserting the new basic blocks.
|
2021-04-08 15:19:26 +08:00
|
|
|
BinaryBasicBlock *MergeBlock =
|
2021-10-26 15:06:34 +08:00
|
|
|
fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets);
|
2017-03-09 11:58:33 +08:00
|
|
|
|
|
|
|
// Since the tail of the original block was split off and it may contain
|
|
|
|
// additional indirect calls, we must add the merge block to the set of
|
|
|
|
// blocks to process.
|
2021-12-29 08:36:17 +08:00
|
|
|
if (MergeBlock)
|
2017-03-09 11:58:33 +08:00
|
|
|
BBs.push_back(MergeBlock);
|
|
|
|
|
2021-12-29 08:36:17 +08:00
|
|
|
if (opts::Verbosity >= 1)
|
2021-12-15 08:52:51 +08:00
|
|
|
outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
|
|
|
|
<< InstIdx << " in " << BB->getName()
|
2017-03-09 11:58:33 +08:00
|
|
|
<< " -> calls = " << NumCalls << "\n";
|
|
|
|
|
|
|
|
if (IsJumpTable)
|
|
|
|
++TotalOptimizedJumpTableCallsites;
|
|
|
|
else
|
|
|
|
++TotalOptimizedIndirectCallsites;
|
|
|
|
|
|
|
|
Modified.insert(&Function);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
TotalIndirectCalls += FuncTotalIndirectCalls;
|
|
|
|
TotalIndirectJmps += FuncTotalIndirectJmps;
|
|
|
|
}
|
|
|
|
|
2017-12-14 15:12:01 +08:00
|
|
|
outs() << "BOLT-INFO: ICP total indirect callsites with profile = "
|
2021-12-15 08:52:51 +08:00
|
|
|
<< TotalIndirectCallsites << "\n"
|
2017-03-09 11:58:33 +08:00
|
|
|
<< "BOLT-INFO: ICP total jump table callsites = "
|
2021-12-15 08:52:51 +08:00
|
|
|
<< TotalJumpTableCallsites << "\n"
|
|
|
|
<< "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
|
2017-03-09 11:58:33 +08:00
|
|
|
<< "BOLT-INFO: ICP percentage of calls that are indirect = "
|
2021-12-15 08:52:51 +08:00
|
|
|
<< format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
|
2017-03-09 11:58:33 +08:00
|
|
|
<< "BOLT-INFO: ICP percentage of indirect calls that can be "
|
|
|
|
"optimized = "
|
|
|
|
<< format("%.1f", (100.0 * TotalNumFrequentCalls) /
|
2021-12-15 08:52:51 +08:00
|
|
|
std::max<size_t>(TotalIndirectCalls, 1))
|
2017-03-09 11:58:33 +08:00
|
|
|
<< "%\n"
|
2017-12-14 15:12:01 +08:00
|
|
|
<< "BOLT-INFO: ICP percentage of indirect callsites that are "
|
|
|
|
"optimized = "
|
2017-03-09 11:58:33 +08:00
|
|
|
<< format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
|
2021-12-15 08:52:51 +08:00
|
|
|
std::max<uint64_t>(TotalIndirectCallsites, 1))
|
2017-03-09 11:58:33 +08:00
|
|
|
<< "%\n"
|
2017-10-21 03:11:34 +08:00
|
|
|
<< "BOLT-INFO: ICP number of method load elimination candidates = "
|
2021-12-15 08:52:51 +08:00
|
|
|
<< TotalMethodLoadEliminationCandidates << "\n"
|
2017-10-21 03:11:34 +08:00
|
|
|
<< "BOLT-INFO: ICP percentage of method calls candidates that have "
|
|
|
|
"loads eliminated = "
|
|
|
|
<< format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
|
2021-12-15 08:52:51 +08:00
|
|
|
std::max<uint64_t>(
|
|
|
|
TotalMethodLoadEliminationCandidates, 1))
|
2017-10-21 03:11:34 +08:00
|
|
|
<< "%\n"
|
[BOLT] Optimize jump tables with hot entries
Summary:
This diff is similar to Bill's diff for optimizing jump tables
(and is built on top of it), but it differs in the strategy used to
optimize the jump table. The previous approach loads the target address
from the jump table and compares it to check if it is a hot target. This
accomplishes branch misprediction reduction by promoting the indirect jmp
to a (more predictable) direct jmp.
load %r10, JMPTABLE
cmp %r10, HOTTARGET
je HOTTARGET
ijmp [JMPTABLE + %index * scale]
The idea in this diff is instead to make dcache better by avoiding the
load of the jump table, leaving branch mispredictions as a secondary
target. To do this we compare the index used in the indirect jmp and if
it matches a known hot entry, it performs a direct jump to the target.
cmp %index, HOTINDEX
je CORRESPONDING_TARGET
ijmp [JMPTABLE + %index * scale]
The downside of this approach is that we may have multiple indices
associated with a single target, but we only have profiling to show
which targets are hot and we have no clue about which indices are hot.
INDEX TARGET
0 4004f8
8 4004f8
10 4003d0
18 4004f8
Profiling data:
TARGET COUNT
4004f8 10020
4003d0 17
In this example, we know 4004f8 is hot, but to make a direct call to it
we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1.
Therefore, once we know a target is hot, we must generate code to
compare against all possible indices associated with this target because
we don't know which index is the hot one (IF there's a hotter index).
cmp %index, 0
je 4004f8
cmp %index, 8
je 4004f8
cmp %index, 18
je 4004f8
(... up to N comparisons as in --indirect-call-promotion-topn=N )
ijmp [JMPTABLE + %index * scale]
(cherry picked from FBD5005620)
2017-05-02 05:04:40 +08:00
|
|
|
<< "BOLT-INFO: ICP percentage of indirect branches that are "
|
2017-03-09 11:58:33 +08:00
|
|
|
"optimized = "
|
|
|
|
<< format("%.1f", (100.0 * TotalNumFrequentJmps) /
|
2021-12-15 08:52:51 +08:00
|
|
|
std::max<uint64_t>(TotalIndirectJmps, 1))
|
2017-03-09 11:58:33 +08:00
|
|
|
<< "%\n"
|
2017-10-21 03:11:34 +08:00
|
|
|
<< "BOLT-INFO: ICP percentage of jump table callsites that are "
|
|
|
|
<< "optimized = "
|
2017-03-09 11:58:33 +08:00
|
|
|
<< format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
|
2021-12-15 08:52:51 +08:00
|
|
|
std::max<uint64_t>(TotalJumpTableCallsites, 1))
|
2017-10-21 03:11:34 +08:00
|
|
|
<< "%\n"
|
|
|
|
<< "BOLT-INFO: ICP number of jump table callsites that can use hot "
|
2021-12-15 08:52:51 +08:00
|
|
|
<< "indices = " << TotalIndexBasedCandidates << "\n"
|
2017-10-21 03:11:34 +08:00
|
|
|
<< "BOLT-INFO: ICP percentage of jump table callsites that use hot "
|
|
|
|
"indices = "
|
|
|
|
<< format("%.1f", (100.0 * TotalIndexBasedJumps) /
|
2021-12-15 08:52:51 +08:00
|
|
|
std::max<uint64_t>(TotalIndexBasedCandidates, 1))
|
2017-03-09 11:58:33 +08:00
|
|
|
<< "%\n";
|
2018-08-31 04:21:50 +08:00
|
|
|
|
2021-06-30 03:11:56 +08:00
|
|
|
(void)verifyProfile;
|
2018-08-31 04:21:50 +08:00
|
|
|
#ifndef NDEBUG
|
|
|
|
verifyProfile(BFs);
|
|
|
|
#endif
|
2017-03-09 11:58:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace bolt
|
|
|
|
} // namespace llvm
|