//===- ModuleSummaryAnalysis.cpp - Module summary index builder -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass builds a ModuleSummaryIndex object for the module, to be written
// to bitcode or LLVM assembly.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/ModuleSummaryAnalysis.h"
|
|
|
|
#include "llvm/Analysis/BlockFrequencyInfo.h"
|
|
|
|
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
|
|
|
|
#include "llvm/Analysis/BranchProbabilityInfo.h"
|
2016-07-17 22:47:01 +08:00
|
|
|
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
|
2016-04-11 21:58:45 +08:00
|
|
|
#include "llvm/Analysis/LoopInfo.h"
|
[thinlto] Basic thinlto fdo heuristic
Summary:
This patch improves thinlto importer
by importing 3x larger functions that are called from hot block.
I compared performance with the trunk on spec, and there
were about 2% on povray and 3.33% on milc. These results seems
to be consistant and match the results Teresa got with her simple
heuristic. Some benchmarks got slower but I think they are just
noisy (mcf, xalancbmki, omnetpp)- running the benchmarks again with
more iterations to confirm. Geomean of all benchmarks including the noisy ones
were about +0.02%.
I see much better improvement on google branch with Easwaran patch
for pgo callsite inlining (the inliner actually inline those big functions)
Over all I see +0.5% improvement, and I get +8.65% on povray.
So I guess we will see much bigger change when Easwaran patch will land
(it depends on new pass manager), but it is still worth putting this to trunk
before it.
Implementation details changes:
- Removed CallsiteCount.
- ProfileCount got replaced by Hotness
- hot-import-multiplier is set to 3.0 for now,
didn't have time to tune it up, but I see that we get most of the interesting
functions with 3, so there is no much performance difference with higher, and
binary size doesn't grow as much as with 10.0.
Reviewers: eraman, mehdi_amini, tejohnson
Subscribers: mehdi_amini, llvm-commits
Differential Revision: https://reviews.llvm.org/D24638
llvm-svn: 282437
2016-09-27 04:37:32 +08:00
|
|
|
#include "llvm/Analysis/ProfileSummaryInfo.h"
|
2016-04-11 21:58:45 +08:00
|
|
|
#include "llvm/IR/CallSite.h"
|
|
|
|
#include "llvm/IR/Dominators.h"
|
2016-04-27 22:19:38 +08:00
|
|
|
#include "llvm/IR/InstIterator.h"
|
2016-04-11 21:58:45 +08:00
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
|
|
#include "llvm/IR/ValueSymbolTable.h"
|
|
|
|
#include "llvm/Pass.h"
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "module-summary-analysis"
|
|
|
|
|
|
|
|
// Walk through the operands of a given User via worklist iteration and
// populate the set of GlobalValue references encountered. Invoked either on
// an Instruction or a GlobalVariable (which walks its initializer).
static void findRefEdges(const User *CurUser, DenseSet<const Value *> &RefEdges,
                         SmallPtrSet<const User *, 8> &Visited) {
  SmallVector<const User *, 32> Worklist;
  Worklist.push_back(CurUser);

  while (!Worklist.empty()) {
    const User *U = Worklist.pop_back_val();

    if (!Visited.insert(U).second)
      continue;

    ImmutableCallSite CS(U);

    for (const auto &OI : U->operands()) {
      const User *Operand = dyn_cast<User>(OI);
      if (!Operand)
        continue;
      if (isa<BlockAddress>(Operand))
        continue;
      if (isa<GlobalValue>(Operand)) {
        // We have a reference to a global value. This should be added to
        // the reference set unless it is a callee. Callees are handled
        // specially by WriteFunction and are added to a separate list.
        if (!(CS && CS.isCallee(&OI)))
          RefEdges.insert(Operand);
        continue;
      }
      Worklist.push_back(Operand);
    }
  }
}

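// Map a raw profile count onto the coarse hotness classification recorded on
// call edges, using the thresholds from the module's profile summary. Returns
// Unknown when no ProfileSummaryInfo is available.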
static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount,
                                          ProfileSummaryInfo *PSI) {
  if (!PSI)
    return CalleeInfo::HotnessType::Unknown;
  if (PSI->isHotCount(ProfileCount))
    return CalleeInfo::HotnessType::Hot;
  if (PSI->isColdCount(ProfileCount))
    return CalleeInfo::HotnessType::Cold;
  return CalleeInfo::HotnessType::None;
}

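// Compute the summary for function F: count its instructions, collect
// reference edges and call graph edges (annotated with profile-derived
// hotness), and add the resulting FunctionSummary to the index.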
static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
                                   const Function &F, BlockFrequencyInfo *BFI,
                                   ProfileSummaryInfo *PSI) {
  // Summary not currently supported for anonymous functions, they must
  // be renamed.
  if (!F.hasName())
    return;

  unsigned NumInsts = 0;
  // Map from callee ValueId to profile count. Used to accumulate profile
  // counts for all static calls to a given callee.
  DenseMap<const Value *, CalleeInfo> CallGraphEdges;
  DenseMap<GlobalValue::GUID, CalleeInfo> IndirectCallEdges;
  DenseSet<const Value *> RefEdges;
  ICallPromotionAnalysis ICallAnalysis;

  SmallPtrSet<const User *, 8> Visited;
  for (const BasicBlock &BB : F)
    for (const Instruction &I : BB) {
      if (isa<DbgInfoIntrinsic>(I))
        continue;
      ++NumInsts;
      findRefEdges(&I, RefEdges, Visited);
      auto CS = ImmutableCallSite(&I);
      if (!CS)
        continue;
      auto *CalledFunction = CS.getCalledFunction();
      // Check if this is a direct call to a known function.
      if (CalledFunction) {
        // Skip nameless and intrinsics.
        if (!CalledFunction->hasName() || CalledFunction->isIntrinsic())
          continue;
        auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None;
        auto *CalleeId =
            M.getValueSymbolTable().lookup(CalledFunction->getName());

        auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
                                   : CalleeInfo::HotnessType::Unknown;
        CallGraphEdges[CalleeId].updateHotness(Hotness);
      } else {
        const auto *CI = dyn_cast<CallInst>(&I);
        // Skip inline assembly calls.
        if (CI && CI->isInlineAsm())
          continue;
        // Skip direct calls.
        if (!CS.getCalledValue() || isa<Constant>(CS.getCalledValue()))
          continue;

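        // For indirect call sites, ask the indirect call promotion analysis
        // for the profiled target candidates and record an edge with the
        // corresponding hotness for each candidate.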
        uint32_t NumVals, NumCandidates;
        uint64_t TotalCount;
        auto CandidateProfileData =
            ICallAnalysis.getPromotionCandidatesForInstruction(
                &I, NumVals, TotalCount, NumCandidates);
        for (auto &Candidate : CandidateProfileData)
          IndirectCallEdges[Candidate.Value].updateHotness(
              getHotness(Candidate.Count, PSI));
      }
    }

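  // Create the function summary, attach the collected call graph and
  // reference edges, and register it in the index under the function's name.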
  GlobalValueSummary::GVFlags Flags(F);
  std::unique_ptr<FunctionSummary> FuncSummary =
      llvm::make_unique<FunctionSummary>(Flags, NumInsts);
  FuncSummary->addCallGraphEdges(CallGraphEdges);
  FuncSummary->addCallGraphEdges(IndirectCallEdges);
  FuncSummary->addRefEdges(RefEdges);
  Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary));
}

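// Compute the summary for global variable V: collect the reference edges from
// its initializer and add the resulting GlobalVarSummary to the index.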
static void computeVariableSummary(ModuleSummaryIndex &Index,
                                   const GlobalVariable &V) {
  DenseSet<const Value *> RefEdges;
  SmallPtrSet<const User *, 8> Visited;
  findRefEdges(&V, RefEdges, Visited);
  GlobalValueSummary::GVFlags Flags(V);
  std::unique_ptr<GlobalVarSummary> GVarSummary =
      llvm::make_unique<GlobalVarSummary>(Flags);
  GVarSummary->addRefEdges(RefEdges);
  Index.addGlobalValueSummary(V.getName(), std::move(GVarSummary));
}

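// Build the module summary index, covering every function and global variable
// defined in M. GetBFICallback, if provided, supplies per-function
// BlockFrequencyInfo used to derive call edge hotness; PSI may be null when no
// profile summary is present.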
ModuleSummaryIndex llvm::buildModuleSummaryIndex(
    const Module &M,
    std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback,
    ProfileSummaryInfo *PSI) {
  ModuleSummaryIndex Index;
  // Check if the module can be promoted, otherwise just disable importing from
  // it by not emitting any summary.
  // FIXME: we could still import *into* it most of the time.
  if (!moduleCanBeRenamedForThinLTO(M))
    return Index;

  // Compute summaries for all functions defined in module, and save in the
  // index.
  for (auto &F : M) {
    if (F.isDeclaration())
      continue;

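    // Obtain BlockFrequencyInfo for this function, either from the caller's
    // callback or, when the function carries an entry count, by computing it
    // locally; it is only needed to classify call edge hotness.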
    BlockFrequencyInfo *BFI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> BFIPtr;
    if (GetBFICallback)
      BFI = GetBFICallback(F);
    else if (F.getEntryCount().hasValue()) {
      LoopInfo LI{DominatorTree(const_cast<Function &>(F))};
      BranchProbabilityInfo BPI{F, LI};
      BFIPtr = llvm::make_unique<BlockFrequencyInfo>(F, BPI, LI);
      BFI = BFIPtr.get();
    }

    computeFunctionSummary(Index, M, F, BFI, PSI);
  }

  // Compute summaries for all variables defined in module, and save in the
  // index.
  for (const GlobalVariable &G : M.globals()) {
    if (G.isDeclaration())
      continue;
    computeVariableSummary(Index, G);
  }
  return Index;
}

char ModuleSummaryIndexAnalysis::PassID;

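// New pass manager interface: build the index on demand, obtaining
// per-function BlockFrequencyInfo through the inner function analysis manager.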
ModuleSummaryIndex
ModuleSummaryIndexAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
  ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  return buildModuleSummaryIndex(
      M,
      [&FAM](const Function &F) {
        return &FAM.getResult<BlockFrequencyAnalysis>(
            *const_cast<Function *>(&F));
      },
      &PSI);
}

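// Legacy pass manager wrapper: the index is built in runOnModule and released
// in doFinalization.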
char ModuleSummaryIndexWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
                      "Module Summary Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_END(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
                    "Module Summary Analysis", false, true)

ModulePass *llvm::createModuleSummaryIndexWrapperPass() {
  return new ModuleSummaryIndexWrapperPass();
}

ModuleSummaryIndexWrapperPass::ModuleSummaryIndexWrapperPass()
    : ModulePass(ID) {
  initializeModuleSummaryIndexWrapperPassPass(*PassRegistry::getPassRegistry());
}

bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) {
  auto &PSI = *getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  Index = buildModuleSummaryIndex(
      M,
      [this](const Function &F) {
        return &(this->getAnalysis<BlockFrequencyInfoWrapperPass>(
                          *const_cast<Function *>(&F))
                      .getBFI());
      },
      &PSI);
  return false;
}

bool ModuleSummaryIndexWrapperPass::doFinalization(Module &M) {
  Index.reset();
  return false;
}

void ModuleSummaryIndexWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<BlockFrequencyInfoWrapperPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
}

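// Conservatively check whether every local value in M could be renamed
// (promoted) without breaking references from inline assembly.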
bool llvm::moduleCanBeRenamedForThinLTO(const Module &M) {
  // We cannot currently promote or rename anything used in inline assembly,
  // which is not visible to the compiler. Detect a possible case by looking
  // for a llvm.used local value, in conjunction with an inline assembly call
  // in the module. Prevent importing of any modules containing these uses by
  // suppressing generation of the index. This also prevents importing
  // into this module, which is also necessary to avoid needing to rename
  // in case of a name clash between a local in this module and an imported
  // global.
  // FIXME: If we find we need a finer-grained approach of preventing promotion
  // and renaming of just the functions using inline assembly we will need to:
  // - Add flag in the function summaries to identify those with inline asm.
  // - Prevent importing of any functions with flag set.
  // - Prevent importing of any global function with the same name as a
  //   function in current module that has the flag set.
  // - For any llvm.used value that is exported and promoted, add a private
  //   alias to the original name in the current module (even if we don't
  //   export the function using those values in inline asm, another function
  //   with a reference could be exported).
  SmallPtrSet<GlobalValue *, 8> Used;
  collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
  bool LocalIsUsed =
      any_of(Used, [](GlobalValue *V) { return V->hasLocalLinkage(); });
  if (!LocalIsUsed)
    return true;

  // Walk all the instructions in the module and find if one is inline ASM.
  auto HasInlineAsm = any_of(M, [](const Function &F) {
    return any_of(instructions(F), [](const Instruction &I) {
      const CallInst *CallI = dyn_cast<CallInst>(&I);
      if (!CallI)
        return false;
      return CallI->isInlineAsm();
    });
  });
  return !HasInlineAsm;
}