forked from OSchip/llvm-project
[ThinLTO] Perform profile-guided indirect call promotion
Summary: To enable profile-guided indirect call promotion in ThinLTO mode, we simply add call graph edges for each profitable target from the profile to the summaries, then the summary-guided importing will consider the callee for importing as usual. Also we need to enable the indirect call promotion pass creation in the PassManagerBuilder when PerformThinLTO=true (we are in the ThinLTO backend), so that the newly imported functions are considered for promotion in the backends. The IC promotion profiles refer to callees by GUID, which required adding GUIDs to the per-module VST in bitcode (and assigning them valueIds similar to how they are assigned valueIds in the combined index). Reviewers: mehdi_amini, xur Subscribers: mehdi_amini, davidxl, llvm-commits Differential Revision: http://reviews.llvm.org/D21932 llvm-svn: 275707
This commit is contained in:
parent
ce7de9b6fb
commit
cd21a646f6
|
@ -80,6 +80,7 @@ struct ValueInfo {
|
|||
assert(Kind == VI_Value && "Not a Value type");
|
||||
return TheValue.V;
|
||||
}
|
||||
bool isGUID() const { return Kind == VI_GUID; }
|
||||
};
|
||||
|
||||
/// \brief Function and variable summary information to aid decisions and
|
||||
|
@ -261,6 +262,14 @@ public:
|
|||
CallGraphEdgeList.push_back(std::make_pair(CalleeGUID, Info));
|
||||
}
|
||||
|
||||
/// Record a call graph edge from this function to each function GUID recorded
|
||||
/// in \p CallGraphEdges.
|
||||
void
|
||||
addCallGraphEdges(DenseMap<GlobalValue::GUID, CalleeInfo> &CallGraphEdges) {
|
||||
for (auto &EI : CallGraphEdges)
|
||||
addCallGraphEdge(EI.first, EI.second);
|
||||
}
|
||||
|
||||
/// Record a call graph edge from this function to the function identified
|
||||
/// by \p CalleeV, with \p CalleeInfo including the cumulative profile
|
||||
/// count (across all calls from this function) or 0 if no PGO.
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include "llvm/Analysis/BlockFrequencyInfo.h"
|
||||
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
|
||||
#include "llvm/Analysis/BranchProbabilityInfo.h"
|
||||
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/IR/CallSite.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
|
@ -73,7 +74,9 @@ void ModuleSummaryIndexBuilder::computeFunctionSummary(
|
|||
// Map from callee ValueId to profile count. Used to accumulate profile
|
||||
// counts for all static calls to a given callee.
|
||||
DenseMap<const Value *, CalleeInfo> CallGraphEdges;
|
||||
DenseMap<GlobalValue::GUID, CalleeInfo> IndirectCallEdges;
|
||||
DenseSet<const Value *> RefEdges;
|
||||
ICallPromotionAnalysis ICallAnalysis;
|
||||
|
||||
SmallPtrSet<const User *, 8> Visited;
|
||||
for (const BasicBlock &BB : F)
|
||||
|
@ -83,13 +86,29 @@ void ModuleSummaryIndexBuilder::computeFunctionSummary(
|
|||
|
||||
if (auto CS = ImmutableCallSite(&I)) {
|
||||
auto *CalledFunction = CS.getCalledFunction();
|
||||
if (CalledFunction && CalledFunction->hasName() &&
|
||||
!CalledFunction->isIntrinsic()) {
|
||||
auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None;
|
||||
auto *CalleeId =
|
||||
M->getValueSymbolTable().lookup(CalledFunction->getName());
|
||||
CallGraphEdges[CalleeId] +=
|
||||
(ScaledCount ? ScaledCount.getValue() : 0);
|
||||
// Check if this is a direct call to a known function.
|
||||
if (CalledFunction) {
|
||||
if (CalledFunction->hasName() && !CalledFunction->isIntrinsic()) {
|
||||
auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None;
|
||||
auto *CalleeId =
|
||||
M->getValueSymbolTable().lookup(CalledFunction->getName());
|
||||
CallGraphEdges[CalleeId] +=
|
||||
(ScaledCount ? ScaledCount.getValue() : 0);
|
||||
}
|
||||
} else {
|
||||
// Otherwise, check for an indirect call (call to a non-const value
|
||||
// that isn't an inline assembly call).
|
||||
const CallInst *CI = dyn_cast<CallInst>(&I);
|
||||
if (CS.getCalledValue() && !isa<Constant>(CS.getCalledValue()) &&
|
||||
!(CI && CI->isInlineAsm())) {
|
||||
uint32_t NumVals, NumCandidates;
|
||||
uint64_t TotalCount;
|
||||
auto CandidateProfileData =
|
||||
ICallAnalysis.getPromotionCandidatesForInstruction(
|
||||
&I, NumVals, TotalCount, NumCandidates);
|
||||
for (auto &Candidate : CandidateProfileData)
|
||||
IndirectCallEdges[Candidate.Value] += Candidate.Count;
|
||||
}
|
||||
}
|
||||
}
|
||||
findRefEdges(&I, RefEdges, Visited);
|
||||
|
@ -99,6 +118,7 @@ void ModuleSummaryIndexBuilder::computeFunctionSummary(
|
|||
std::unique_ptr<FunctionSummary> FuncSummary =
|
||||
llvm::make_unique<FunctionSummary>(Flags, NumInsts);
|
||||
FuncSummary->addCallGraphEdges(CallGraphEdges);
|
||||
FuncSummary->addCallGraphEdges(IndirectCallEdges);
|
||||
FuncSummary->addRefEdges(RefEdges);
|
||||
Index->addGlobalValueSummary(F.getName(), std::move(FuncSummary));
|
||||
}
|
||||
|
|
|
@ -119,6 +119,14 @@ class ModuleBitcodeWriter : public BitcodeWriter {
|
|||
/// The start bit of the module block, for use in generating a module hash
|
||||
uint64_t BitcodeStartBit = 0;
|
||||
|
||||
/// Map that holds the correspondence between GUIDs in the summary index,
|
||||
/// that came from indirect call profiles, and a value id generated by this
|
||||
/// class to use in the VST and summary block records.
|
||||
std::map<GlobalValue::GUID, unsigned> GUIDToValueIdMap;
|
||||
|
||||
/// Tracks the last value id recorded in the GUIDToValueMap.
|
||||
unsigned GlobalValueId;
|
||||
|
||||
public:
|
||||
/// Constructs a ModuleBitcodeWriter object for the given Module,
|
||||
/// writing to the provided \p Buffer.
|
||||
|
@ -132,6 +140,25 @@ public:
|
|||
// will start at the bitcode, and we need the offset of the VST
|
||||
// to line up.
|
||||
BitcodeStartBit = Stream.GetCurrentBitNo();
|
||||
|
||||
// Assign ValueIds to any callee values in the index that came from
|
||||
// indirect call profiles and were recorded as a GUID not a Value*
|
||||
// (which would have been assigned an ID by the ValueEnumerator).
|
||||
// The starting ValueId is just after the number of values in the
|
||||
// ValueEnumerator, so that they can be emitted in the VST.
|
||||
GlobalValueId = VE.getValues().size();
|
||||
if (Index)
|
||||
for (const auto &GUIDSummaryLists : *Index)
|
||||
// Examine all summaries for this GUID.
|
||||
for (auto &Summary : GUIDSummaryLists.second)
|
||||
if (auto FS = dyn_cast<FunctionSummary>(Summary.get()))
|
||||
// For each call in the function summary, see if the call
|
||||
// is to a GUID (which means it is for an indirect call,
|
||||
// otherwise we would have a Value for it). If so, synthesize
|
||||
// a value id.
|
||||
for (auto &CallEdge : FS->calls())
|
||||
if (CallEdge.first.isGUID())
|
||||
assignValueId(CallEdge.first.getGUID());
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -260,6 +287,22 @@ private:
|
|||
unsigned FSModRefsAbbrev);
|
||||
void writePerModuleGlobalValueSummary();
|
||||
void writeModuleHash(size_t BlockStartPos);
|
||||
|
||||
void assignValueId(GlobalValue::GUID ValGUID) {
|
||||
GUIDToValueIdMap[ValGUID] = ++GlobalValueId;
|
||||
}
|
||||
unsigned getValueId(GlobalValue::GUID ValGUID) {
|
||||
const auto &VMI = GUIDToValueIdMap.find(ValGUID);
|
||||
assert(VMI != GUIDToValueIdMap.end());
|
||||
return VMI->second;
|
||||
}
|
||||
// Helper to get the valueId for the type of value recorded in VI.
|
||||
unsigned getValueId(ValueInfo VI) {
|
||||
if (VI.isGUID())
|
||||
return getValueId(VI.getGUID());
|
||||
return VE.getValueID(VI.getValue());
|
||||
}
|
||||
std::map<GlobalValue::GUID, unsigned> &valueIds() { return GUIDToValueIdMap; }
|
||||
};
|
||||
|
||||
/// Class to manage the bitcode writing for a combined index.
|
||||
|
@ -2707,6 +2750,7 @@ void ModuleBitcodeWriter::writeValueSymbolTable(
|
|||
unsigned FnEntry8BitAbbrev;
|
||||
unsigned FnEntry7BitAbbrev;
|
||||
unsigned FnEntry6BitAbbrev;
|
||||
unsigned GUIDEntryAbbrev;
|
||||
if (IsModuleLevel && hasVSTOffsetPlaceholder()) {
|
||||
// 8-bit fixed-width VST_CODE_FNENTRY function strings.
|
||||
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
|
||||
|
@ -2734,11 +2778,19 @@ void ModuleBitcodeWriter::writeValueSymbolTable(
|
|||
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
|
||||
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
|
||||
FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv);
|
||||
|
||||
// FIXME: Change the name of this record as it is now used by
|
||||
// the per-module index as well.
|
||||
Abbv = new BitCodeAbbrev();
|
||||
Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY));
|
||||
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
|
||||
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid
|
||||
GUIDEntryAbbrev = Stream.EmitAbbrev(Abbv);
|
||||
}
|
||||
|
||||
// FIXME: Set up the abbrev, we know how many values there are!
|
||||
// FIXME: We know if the type names can use 7-bit ascii.
|
||||
SmallVector<unsigned, 64> NameVals;
|
||||
SmallVector<uint64_t, 64> NameVals;
|
||||
|
||||
for (const ValueName &Name : VST) {
|
||||
// Figure out the encoding to use for the name.
|
||||
|
@ -2799,6 +2851,16 @@ void ModuleBitcodeWriter::writeValueSymbolTable(
|
|||
Stream.EmitRecord(Code, NameVals, AbbrevToUse);
|
||||
NameVals.clear();
|
||||
}
|
||||
// Emit any GUID valueIDs created for indirect call edges into the
|
||||
// module-level VST.
|
||||
if (IsModuleLevel && hasVSTOffsetPlaceholder())
|
||||
for (const auto &GI : valueIds()) {
|
||||
NameVals.push_back(GI.second);
|
||||
NameVals.push_back(GI.first);
|
||||
Stream.EmitRecord(bitc::VST_CODE_COMBINED_ENTRY, NameVals,
|
||||
GUIDEntryAbbrev);
|
||||
NameVals.clear();
|
||||
}
|
||||
Stream.ExitBlock();
|
||||
}
|
||||
|
||||
|
@ -3220,12 +3282,11 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
|
|||
std::sort(Calls.begin(), Calls.end(),
|
||||
[this](const FunctionSummary::EdgeTy &L,
|
||||
const FunctionSummary::EdgeTy &R) {
|
||||
return VE.getValueID(L.first.getValue()) <
|
||||
VE.getValueID(R.first.getValue());
|
||||
return getValueId(L.first) < getValueId(R.first);
|
||||
});
|
||||
bool HasProfileData = F.getEntryCount().hasValue();
|
||||
for (auto &ECI : Calls) {
|
||||
NameVals.push_back(VE.getValueID(ECI.first.getValue()));
|
||||
NameVals.push_back(getValueId(ECI.first));
|
||||
assert(ECI.second.CallsiteCount > 0 && "Expected at least one callsite");
|
||||
NameVals.push_back(ECI.second.CallsiteCount);
|
||||
if (HasProfileData)
|
||||
|
|
|
@ -407,10 +407,11 @@ void PassManagerBuilder::populateModulePassManager(
|
|||
/// PGO instrumentation is added during the compile phase for ThinLTO, do
|
||||
/// not run it a second time
|
||||
addPGOInstrPasses(MPM);
|
||||
// Indirect call promotion that promotes intra-module targets only.
|
||||
MPM.add(createPGOIndirectCallPromotionLegacyPass());
|
||||
}
|
||||
|
||||
// Indirect call promotion that promotes intra-module targets only.
|
||||
MPM.add(createPGOIndirectCallPromotionLegacyPass());
|
||||
|
||||
if (EnableNonLTOGlobalsModRef)
|
||||
// We add a module alias analysis pass here. In part due to bugs in the
|
||||
// analysis infrastructure this "works" in that the analysis stays alive
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define void @a() {
|
||||
entry:
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
; Do setup work for all below tests: generate bitcode and combined index
|
||||
; RUN: opt -module-summary %s -o %t.bc
|
||||
; RUN: opt -module-summary %p/Inputs/thinlto_indirect_call_promotion.ll -o %t2.bc
|
||||
; RUN: llvm-lto -thinlto -o %t3 %t.bc %t2.bc
|
||||
|
||||
; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -o %t4.bc -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS
|
||||
; IMPORTS: Import a
|
||||
|
||||
; RUN: opt %t4.bc -pgo-icall-prom -S -icp-count-threshold=1 | FileCheck %s --check-prefix=ICALL-PROM
|
||||
; RUN: opt %t4.bc -pgo-icall-prom -S -pass-remarks=pgo-icall-prom -icp-count-threshold=1 2>&1 | FileCheck %s --check-prefix=PASS-REMARK
|
||||
; PASS-REMARK: Promote indirect call to a with count 1 out of 1
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
@foo = external local_unnamed_addr global void ()*, align 8
|
||||
|
||||
define i32 @main() local_unnamed_addr {
|
||||
entry:
|
||||
%0 = load void ()*, void ()** @foo, align 8
|
||||
; ICALL-PROM: br i1 %{{[0-9]+}}, label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
|
||||
tail call void %0(), !prof !1
|
||||
ret i32 0
|
||||
}
|
||||
|
||||
!1 = !{!"VP", i32 0, i64 1, i64 -6289574019528802036, i64 1}
|
||||
|
||||
; Should not have a VP annotation on new indirect call (check before and after
|
||||
; branch_weights annotation).
|
||||
; ICALL-PROM-NOT: !"VP"
|
||||
; ICALL-PROM: [[BRANCH_WEIGHT]] = !{!"branch_weights", i32 1, i32 0}
|
||||
; ICALL-PROM-NOT: !"VP"
|
Loading…
Reference in New Issue