forked from OSchip/llvm-project
[InlineCost] Enable the new switch cost heuristic
Summary: This enables the new switch inline cost heuristic (r301649) by removing the old heuristic as well as the flag itself. In my experiments with the LLVM test suite and spec2000/2006, a +17.82% performance improvement and an 8% code size reduction were observed in spec2000/vertex with O3 LTO on AArch64. No significant code size / performance regression was found in O3/O2/Os. No significant complaints were reported on the llvm-dev thread. Reviewers: hans, chandlerc, eraman, haicheng, mcrosier, bmakam, eastig, ddibyend, echristo Reviewed By: echristo Subscribers: javed.absar, kristof.beyls, echristo, aemerson, rengolin, mehdi_amini Differential Revision: https://reviews.llvm.org/D32653 llvm-svn: 304594
This commit is contained in:
parent
2c08fde9e5
commit
2960d41e68
|
@ -54,11 +54,6 @@ static cl::opt<int>
|
||||||
cl::init(45),
|
cl::init(45),
|
||||||
cl::desc("Threshold for inlining cold callsites"));
|
cl::desc("Threshold for inlining cold callsites"));
|
||||||
|
|
||||||
static cl::opt<bool>
|
|
||||||
EnableGenericSwitchCost("inline-generic-switch-cost", cl::Hidden,
|
|
||||||
cl::init(false),
|
|
||||||
cl::desc("Enable generic switch cost model"));
|
|
||||||
|
|
||||||
// We introduce this threshold to help performance of instrumentation based
|
// We introduce this threshold to help performance of instrumentation based
|
||||||
// PGO before we actually hook up inliner with analysis passes such as BPI and
|
// PGO before we actually hook up inliner with analysis passes such as BPI and
|
||||||
// BFI.
|
// BFI.
|
||||||
|
@ -1015,83 +1010,68 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
|
||||||
if (isa<ConstantInt>(V))
|
if (isa<ConstantInt>(V))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (EnableGenericSwitchCost) {
|
// Assume the most general case where the swith is lowered into
|
||||||
// Assume the most general case where the swith is lowered into
|
// either a jump table, bit test, or a balanced binary tree consisting of
|
||||||
// either a jump table, bit test, or a balanced binary tree consisting of
|
// case clusters without merging adjacent clusters with the same
|
||||||
// case clusters without merging adjacent clusters with the same
|
// destination. We do not consider the switches that are lowered with a mix
|
||||||
// destination. We do not consider the switches that are lowered with a mix
|
// of jump table/bit test/binary search tree. The cost of the switch is
|
||||||
// of jump table/bit test/binary search tree. The cost of the switch is
|
// proportional to the size of the tree or the size of jump table range.
|
||||||
// proportional to the size of the tree or the size of jump table range.
|
//
|
||||||
|
|
||||||
// Exit early for a large switch, assuming one case needs at least one
|
|
||||||
// instruction.
|
|
||||||
// FIXME: This is not true for a bit test, but ignore such case for now to
|
|
||||||
// save compile-time.
|
|
||||||
int64_t CostLowerBound =
|
|
||||||
std::min((int64_t)INT_MAX,
|
|
||||||
(int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
|
|
||||||
|
|
||||||
if (CostLowerBound > Threshold) {
|
|
||||||
Cost = CostLowerBound;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned JumpTableSize = 0;
|
|
||||||
unsigned NumCaseCluster =
|
|
||||||
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
|
|
||||||
|
|
||||||
// If suitable for a jump table, consider the cost for the table size and
|
|
||||||
// branch to destination.
|
|
||||||
if (JumpTableSize) {
|
|
||||||
int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
|
|
||||||
4 * InlineConstants::InstrCost;
|
|
||||||
Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Considering forming a binary search, we should find the number of nodes
|
|
||||||
// which is same as the number of comparisons when lowered. For a given
|
|
||||||
// number of clusters, n, we can define a recursive function, f(n), to find
|
|
||||||
// the number of nodes in the tree. The recursion is :
|
|
||||||
// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
|
|
||||||
// and f(n) = n, when n <= 3.
|
|
||||||
// This will lead a binary tree where the leaf should be either f(2) or f(3)
|
|
||||||
// when n > 3. So, the number of comparisons from leaves should be n, while
|
|
||||||
// the number of non-leaf should be :
|
|
||||||
// 2^(log2(n) - 1) - 1
|
|
||||||
// = 2^log2(n) * 2^-1 - 1
|
|
||||||
// = n / 2 - 1.
|
|
||||||
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
|
|
||||||
// number of comparisons in a simple closed form :
|
|
||||||
// n + n / 2 - 1 = n * 3 / 2 - 1
|
|
||||||
if (NumCaseCluster <= 3) {
|
|
||||||
// Suppose a comparison includes one compare and one conditional branch.
|
|
||||||
Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
|
|
||||||
uint64_t SwitchCost =
|
|
||||||
ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
|
|
||||||
Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use a simple switch cost model where we accumulate a cost proportional to
|
|
||||||
// the number of distinct successor blocks. This fan-out in the CFG cannot
|
|
||||||
// be represented for free even if we can represent the core switch as a
|
|
||||||
// jumptable that takes a single instruction.
|
|
||||||
///
|
|
||||||
// NB: We convert large switches which are just used to initialize large phi
|
// NB: We convert large switches which are just used to initialize large phi
|
||||||
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
|
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
|
||||||
// inlining those. It will prevent inlining in cases where the optimization
|
// inlining those. It will prevent inlining in cases where the optimization
|
||||||
// does not (yet) fire.
|
// does not (yet) fire.
|
||||||
SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
|
|
||||||
SuccessorBlocks.insert(SI.getDefaultDest());
|
// Exit early for a large switch, assuming one case needs at least one
|
||||||
for (auto Case : SI.cases())
|
// instruction.
|
||||||
SuccessorBlocks.insert(Case.getCaseSuccessor());
|
// FIXME: This is not true for a bit test, but ignore such case for now to
|
||||||
// Add cost corresponding to the number of distinct destinations. The first
|
// save compile-time.
|
||||||
// we model as free because of fallthrough.
|
int64_t CostLowerBound =
|
||||||
Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
|
std::min((int64_t)INT_MAX,
|
||||||
|
(int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
|
||||||
|
|
||||||
|
if (CostLowerBound > Threshold) {
|
||||||
|
Cost = CostLowerBound;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned JumpTableSize = 0;
|
||||||
|
unsigned NumCaseCluster =
|
||||||
|
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
|
||||||
|
|
||||||
|
// If suitable for a jump table, consider the cost for the table size and
|
||||||
|
// branch to destination.
|
||||||
|
if (JumpTableSize) {
|
||||||
|
int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
|
||||||
|
4 * InlineConstants::InstrCost;
|
||||||
|
Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Considering forming a binary search, we should find the number of nodes
|
||||||
|
// which is same as the number of comparisons when lowered. For a given
|
||||||
|
// number of clusters, n, we can define a recursive function, f(n), to find
|
||||||
|
// the number of nodes in the tree. The recursion is :
|
||||||
|
// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
|
||||||
|
// and f(n) = n, when n <= 3.
|
||||||
|
// This will lead a binary tree where the leaf should be either f(2) or f(3)
|
||||||
|
// when n > 3. So, the number of comparisons from leaves should be n, while
|
||||||
|
// the number of non-leaf should be :
|
||||||
|
// 2^(log2(n) - 1) - 1
|
||||||
|
// = 2^log2(n) * 2^-1 - 1
|
||||||
|
// = n / 2 - 1.
|
||||||
|
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
|
||||||
|
// number of comparisons in a simple closed form :
|
||||||
|
// n + n / 2 - 1 = n * 3 / 2 - 1
|
||||||
|
if (NumCaseCluster <= 3) {
|
||||||
|
// Suppose a comparison includes one compare and one conditional branch.
|
||||||
|
Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
|
||||||
|
uint64_t SwitchCost =
|
||||||
|
ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
|
||||||
|
Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true | FileCheck %s
|
; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux | FileCheck %s
|
||||||
; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true | FileCheck %s
|
; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux | FileCheck %s
|
||||||
|
|
||||||
define i32 @callee_range(i32 %a, i32* %P) {
|
define i32 @callee_range(i32 %a, i32* %P) {
|
||||||
switch i32 %a, label %sw.default [
|
switch i32 %a, label %sw.default [
|
||||||
|
|
Loading…
Reference in New Issue