forked from OSchip/llvm-project

HFSort/call graph refactoring

Summary: I've factored the call graph code out of the dataflow and function
reordering code and done a few small renames/cleanups. I've also moved the
function reordering pass into a separate file because it was starting to get
big. I've got more refactoring planned for hfsort/call graph, but this is a
start. (cherry picked from FBD5140771)

parent 9b190cc74b
commit 733e8c464f
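In outline, passes now build and query one shared call graph instead of keeping
their own edge maps. The frame passes in this diff reduce to the following
pattern (a sketch assembled from the hunks below, not a new API):

    // After the refactor (see the FrameAnalysis/FrameOptimizer hunks below):
    CallGraph Cg = buildCallGraph(BC, BFs);              // shared construction
    auto TopologicalCGOrder = Cg.buildTraversalOrder();  // bottom-up order
    // Per-pass analysis then walks Cg.Nodes / Cg.Arcs instead of ad-hoc maps.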
@@ -13,6 +13,7 @@
#include "Passes/FrameOptimizer.h"
#include "Passes/IndirectCallPromotion.h"
#include "Passes/Inliner.h"
#include "Passes/ReorderFunctions.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include <numeric>

@@ -10,11 +10,8 @@
//===----------------------------------------------------------------------===//

#include "BinaryPasses.h"
#include "HFSort.h"
#include "llvm/Support/Options.h"

#include <fstream>

#define DEBUG_TYPE "bolt"

using namespace llvm;

@@ -52,11 +49,9 @@ namespace opts {
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<unsigned> Verbosity;
extern cl::opt<uint32_t> RandomSeed;
extern cl::opt<bool> Relocs;
extern cl::opt<bolt::BinaryFunction::SplittingType> SplitFunctions;
extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern size_t padFunction(const bolt::BinaryFunction &Function);

enum DynoStatsSortOrder : char {
  Ascending,

@@ -71,18 +66,6 @@ DynoStatsSortOrderOpt("print-sorted-by-order",
  cl::init(DynoStatsSortOrder::Descending),
  cl::cat(BoltOptCategory));

static cl::opt<std::string>
FunctionOrderFile("function-order",
  cl::desc("file containing an ordered list of functions to use for function "
           "reordering"),
  cl::cat(BoltOptCategory));

static cl::opt<std::string>
GenerateFunctionOrderFile("generate-function-order",
  cl::desc("file to dump the ordered list of functions to use for function "
           "reordering"),
  cl::cat(BoltOptCategory));

static cl::opt<bool>
ICFUseDFS("icf-dfs",
  cl::desc("use DFS ordering when using -icf option"),

@@ -143,41 +126,6 @@ ReorderBlocks("reorder-blocks",
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<bolt::BinaryFunction::ReorderType>
ReorderFunctions("reorder-functions",
  cl::desc("reorder and cluster functions (works only with relocations)"),
  cl::init(bolt::BinaryFunction::RT_NONE),
  cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE,
               "none",
               "do not reorder functions"),
             clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT,
               "exec-count",
               "order by execution count"),
             clEnumValN(bolt::BinaryFunction::RT_HFSORT,
               "hfsort",
               "use hfsort algorithm"),
             clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS,
               "hfsort+",
               "use hfsort+ algorithm"),
             clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN,
               "pettis-hansen",
               "use Pettis-Hansen algorithm"),
             clEnumValN(bolt::BinaryFunction::RT_RANDOM,
               "random",
               "reorder functions randomly"),
             clEnumValN(bolt::BinaryFunction::RT_USER,
               "user",
               "use function order specified by -function-order"),
             clEnumValEnd),
  cl::cat(BoltOptCategory));

static cl::opt<bool>
ReorderFunctionsUseHotSize("reorder-functions-use-hot-size",
  cl::desc("use a function's hot size when doing clustering"),
  cl::init(true),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,

@@ -200,13 +148,6 @@ SctcMode("sctc-mode",
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<bool>
UseEdgeCounts("use-edge-counts",
  cl::desc("use edge count data when doing clustering"),
  cl::init(true),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
@@ -1177,424 +1118,5 @@ void StripRepRet::runOnFunctions(
  }
}

void ReorderFunctions::buildCallGraph(BinaryContext &BC,
                                      std::map<uint64_t, BinaryFunction> &BFs) {
  // Add call graph nodes.
  auto lookupNode = [&](BinaryFunction *Function) {
    auto It = FuncToTargetId.find(Function);
    if (It == FuncToTargetId.end()) {
      // It's ok to use the hot size here when the function is split. This is
      // because emitFunctions will emit the hot part first in the order that is
      // computed by ReorderFunctions. The cold part will be emitted with the
      // rest of the cold functions and code.
      const auto Size = opts::ReorderFunctionsUseHotSize && Function->isSplit()
        ? Function->estimateHotSize()
        : Function->estimateSize();
      const auto Id = Cg.addTarget(Size);
      assert(size_t(Id) == Funcs.size());
      Funcs.push_back(Function);
      FuncToTargetId[Function] = Id;
      // NOTE: for functions without a profile, we set the number of samples
      // to zero. This will keep these functions from appearing in the hot
      // section. This is a little weird because we wouldn't be trying to
      // create a node for a function unless it was the target of a call from
      // a hot block. The alternative would be to set the count to one or
      // accumulate the number of calls from the callsite into the function
      // samples. Results from performance testing seem to favor the zero
      // count though, so I'm leaving it this way for now.
      Cg.Targets[Id].Samples = Function->hasProfile() ? Function->getExecutionCount() : 0;
      assert(Funcs[Id] == Function);
      return Id;
    } else {
      return It->second;
    }
  };

  // Add call graph edges.
  uint64_t NotProcessed = 0;
  uint64_t TotalCalls = 0;
  for (auto &It : BFs) {
    auto *Function = &It.second;

    if (!shouldOptimize(*Function) || !Function->hasProfile()) {
      continue;
    }

    auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames());
    const auto SrcId = lookupNode(Function);
    uint64_t Offset = Function->getAddress();

    auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) {
      if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) {
        const auto DstId = lookupNode(DstFunc);
        auto &A = Cg.incArcWeight(SrcId, DstId, Count);
        if (!opts::UseEdgeCounts) {
          A.AvgCallOffset += (Offset - DstFunc->getAddress());
        }
        DEBUG(dbgs() << "BOLT-DEBUG: Reorder functions: call " << *Function
                     << " -> " << *DstFunc << " @ " << Offset << "\n");
        return true;
      }
      return false;
    };

    for (auto *BB : Function->layout()) {
      // Don't count calls from cold blocks
      if (BB->isCold())
        continue;

      for (auto &Inst : *BB) {
        // Find call instructions and extract target symbols from each one.
        if (BC.MIA->isCall(Inst)) {
          ++TotalCalls;
          if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) {
            // For direct calls, just use the BB execution count.
            assert(BB->hasProfile());
            const auto Count = opts::UseEdgeCounts ? BB->getExecutionCount() : 1;
            if (!recordCall(DstSym, Count))
              ++NotProcessed;
          } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) {
            // For indirect calls and jump tables, use branch data.
            assert(BranchDataOrErr);
            const FuncBranchData &BranchData = BranchDataOrErr.get();
            const auto DataOffset =
              BC.MIA->getAnnotationAs<uint64_t>(Inst, "EdgeCountData");

            for (const auto &BI : BranchData.getBranchRange(DataOffset)) {
              // Count each target as a separate call.
              ++TotalCalls;

              if (!BI.To.IsSymbol) {
                ++NotProcessed;
                continue;
              }

              auto Itr = BC.GlobalSymbols.find(BI.To.Name);
              if (Itr == BC.GlobalSymbols.end()) {
                ++NotProcessed;
                continue;
              }

              const auto *DstSym =
                BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat");

              if (!recordCall(DstSym, opts::UseEdgeCounts ? BI.Branches : 1))
                ++NotProcessed;
            }
          }
        }

        if (!opts::UseEdgeCounts) {
          Offset += BC.computeCodeSize(&Inst, &Inst + 1);
        }
      }
    }
  }
  outs() << "BOLT-WARNING: ReorderFunctions: " << NotProcessed
         << " callsites not processed out of " << TotalCalls << "\n";

  // Normalize arc weights.
  if (!opts::UseEdgeCounts) {
    for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) {
      auto &Func = Cg.Targets[FuncId];
      for (auto Caller : Func.Preds) {
        auto &A = *Cg.Arcs.find(Arc(Caller, FuncId));
        A.NormalizedWeight = A.Weight / Func.Samples;
        A.AvgCallOffset /= A.Weight;
        assert(A.AvgCallOffset < Cg.Targets[Caller].Size);
      }
    }
  } else {
    for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) {
      auto &Func = Cg.Targets[FuncId];
      for (auto Caller : Func.Preds) {
        auto &A = *Cg.Arcs.find(Arc(Caller, FuncId));
        A.NormalizedWeight = A.Weight / Func.Samples;
      }
    }
  }
}

void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
                               std::map<uint64_t, BinaryFunction> &BFs) {
  std::vector<uint64_t> FuncAddr(Cg.Targets.size());  // Just for computing stats
  uint64_t TotalSize = 0;
  uint32_t Index = 0;

  // Set order of hot functions based on clusters.
  for (const auto &Cluster : Clusters) {
    for (const auto FuncId : Cluster.Targets) {
      assert(Cg.Targets[FuncId].Samples > 0);
      Funcs[FuncId]->setIndex(Index++);
      FuncAddr[FuncId] = TotalSize;
      TotalSize += Cg.Targets[FuncId].Size;
    }
  }

  if (opts::ReorderFunctions == BinaryFunction::RT_NONE)
    return;

  if (opts::Verbosity == 0) {
#ifndef NDEBUG
    if (!DebugFlag || !isCurrentDebugType("hfsort"))
      return;
#else
    return;
#endif
  }

  TotalSize = 0;
  uint64_t CurPage = 0;
  uint64_t Hotfuncs = 0;
  double TotalDistance = 0;
  double TotalCalls = 0;
  double TotalCalls64B = 0;
  double TotalCalls4KB = 0;
  double TotalCalls2MB = 0;
  dbgs() << "============== page 0 ==============\n";
  for (auto &Cluster : Clusters) {
    dbgs() <<
      format("-------- density = %.3lf (%u / %u) --------\n",
             (double) Cluster.Samples / Cluster.Size,
             Cluster.Samples, Cluster.Size);

    for (auto FuncId : Cluster.Targets) {
      if (Cg.Targets[FuncId].Samples > 0) {
        Hotfuncs++;

        dbgs() << "BOLT-INFO: hot func " << *Funcs[FuncId]
               << " (" << Cg.Targets[FuncId].Size << ")\n";

        uint64_t Dist = 0;
        uint64_t Calls = 0;
        for (auto Dst : Cg.Targets[FuncId].Succs) {
          auto &A = *Cg.Arcs.find(Arc(FuncId, Dst));
          auto D =
            std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset));
          auto W = A.Weight;
          Calls += W;
          if (D < 64) TotalCalls64B += W;
          if (D < 4096) TotalCalls4KB += W;
          if (D < (2 << 20)) TotalCalls2MB += W;
          Dist += A.Weight * D;
          dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
                           "weight = %.0lf, callDist = %f\n",
                           A.Src, FuncAddr[A.Src], A.AvgCallOffset,
                           A.Dst, FuncAddr[A.Dst], A.Weight, D);
        }
        TotalCalls += Calls;
        TotalDistance += Dist;
        dbgs() << format("start = %6u : avgCallDist = %lu : %s\n",
                         TotalSize,
                         Calls ? Dist / Calls : 0,
                         Funcs[FuncId]->getPrintName().c_str());
        TotalSize += Cg.Targets[FuncId].Size;
        auto NewPage = TotalSize / PageSize;
        if (NewPage != CurPage) {
          CurPage = NewPage;
          dbgs() << format("============== page %u ==============\n", CurPage);
        }
      }
    }
  }
  dbgs() << format("  Number of hot functions: %u\n"
                   "  Number of clusters: %lu\n",
                   Hotfuncs, Clusters.size())
         << format("  Final average call distance = %.1lf (%.0lf / %.0lf)\n",
                   TotalCalls ? TotalDistance / TotalCalls : 0,
                   TotalDistance, TotalCalls)
         << format("  Total Calls = %.0lf\n", TotalCalls);
  if (TotalCalls) {
    dbgs() << format("  Total Calls within 64B = %.0lf (%.2lf%%)\n",
                     TotalCalls64B, 100 * TotalCalls64B / TotalCalls)
           << format("  Total Calls within 4KB = %.0lf (%.2lf%%)\n",
                     TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls)
           << format("  Total Calls within 2MB = %.0lf (%.2lf%%)\n",
                     TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls);
  }
}

namespace {

std::vector<std::string> readFunctionOrderFile() {
  std::vector<std::string> FunctionNames;
  std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in);
  if (!FuncsFile) {
    errs() << "Ordered functions file \"" << opts::FunctionOrderFile
           << "\" can't be opened.\n";
    exit(1);
  }
  std::string FuncName;
  while (std::getline(FuncsFile, FuncName)) {
    FunctionNames.push_back(FuncName);
  }
  return FunctionNames;
}

}

void ReorderFunctions::runOnFunctions(BinaryContext &BC,
                                      std::map<uint64_t, BinaryFunction> &BFs,
                                      std::set<uint64_t> &LargeFunctions) {
  if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) {
    errs() << "BOLT-ERROR: Function reordering only works when "
           << "relocs are enabled.\n";
    exit(1);
  }

  if (opts::ReorderFunctions != BinaryFunction::RT_NONE &&
      opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT &&
      opts::ReorderFunctions != BinaryFunction::RT_USER) {
    buildCallGraph(BC, BFs);
  }

  std::vector<Cluster> Clusters;

  switch (opts::ReorderFunctions) {
  case BinaryFunction::RT_NONE:
    break;
  case BinaryFunction::RT_EXEC_COUNT:
    {
      std::vector<BinaryFunction *> SortedFunctions(BFs.size());
      uint32_t Index = 0;
      std::transform(BFs.begin(),
                     BFs.end(),
                     SortedFunctions.begin(),
                     [](std::pair<const uint64_t, BinaryFunction> &BFI) {
                       return &BFI.second;
                     });
      std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(),
                       [&](const BinaryFunction *A, const BinaryFunction *B) {
                         if (!opts::shouldProcess(*A))
                           return false;
                         const auto PadA = opts::padFunction(*A);
                         const auto PadB = opts::padFunction(*B);
                         if (!PadA || !PadB) {
                           if (PadA)
                             return true;
                           if (PadB)
                             return false;
                         }
                         return !A->hasProfile() &&
                           (B->hasProfile() ||
                            (A->getExecutionCount() > B->getExecutionCount()));
                       });
      for (auto *BF : SortedFunctions) {
        if (BF->hasProfile())
          BF->setIndex(Index++);
      }
    }
    break;
  case BinaryFunction::RT_HFSORT:
    Clusters = clusterize(Cg);
    break;
  case BinaryFunction::RT_HFSORT_PLUS:
    Clusters = hfsortPlus(Cg);
    break;
  case BinaryFunction::RT_PETTIS_HANSEN:
    Clusters = pettisAndHansen(Cg);
    break;
  case BinaryFunction::RT_RANDOM:
    std::srand(opts::RandomSeed);
    Clusters = randomClusters(Cg);
    break;
  case BinaryFunction::RT_USER:
    {
      uint32_t Index = 0;
      for (const auto &Function : readFunctionOrderFile()) {
        std::vector<uint64_t> FuncAddrs;

        auto Itr = BC.GlobalSymbols.find(Function);
        if (Itr == BC.GlobalSymbols.end()) {
          uint32_t LocalID = 1;
          while (1) {
            // If we can't find the main symbol name, look for alternates.
            Itr = BC.GlobalSymbols.find(Function + "/" + std::to_string(LocalID));
            if (Itr != BC.GlobalSymbols.end())
              FuncAddrs.push_back(Itr->second);
            else
              break;
            LocalID++;
          }
        } else {
          FuncAddrs.push_back(Itr->second);
        }

        if (FuncAddrs.empty()) {
          errs() << "BOLT-WARNING: Reorder functions: can't find function for "
                 << Function << ".\n";
          continue;
        }

        for (const auto FuncAddr : FuncAddrs) {
          const auto *FuncSym = BC.getOrCreateGlobalSymbol(FuncAddr, "FUNCat");
          assert(FuncSym);

          auto *BF = BC.getFunctionForSymbol(FuncSym);
          if (!BF) {
            errs() << "BOLT-WARNING: Reorder functions: can't find function for "
                   << Function << ".\n";
            break;
          }
          if (!BF->hasValidIndex()) {
            BF->setIndex(Index++);
          } else if (opts::Verbosity > 0) {
            errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n";
          }
        }
      }
    }
    break;
  }

  reorder(std::move(Clusters), BFs);

  if (!opts::GenerateFunctionOrderFile.empty()) {
    std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out);
    if (!FuncsFile) {
      errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile
             << "\" can't be opened.\n";
      exit(1);
    }

    std::vector<BinaryFunction *> SortedFunctions(BFs.size());

    std::transform(BFs.begin(),
                   BFs.end(),
                   SortedFunctions.begin(),
                   [](std::pair<const uint64_t, BinaryFunction> &BFI) {
                     return &BFI.second;
                   });

    // Sort functions by index.
    std::stable_sort(
      SortedFunctions.begin(),
      SortedFunctions.end(),
      [](const BinaryFunction *A, const BinaryFunction *B) {
        if (A->hasValidIndex() && B->hasValidIndex()) {
          return A->getIndex() < B->getIndex();
        } else if (A->hasValidIndex() && !B->hasValidIndex()) {
          return true;
        } else if (!A->hasValidIndex() && B->hasValidIndex()) {
          return false;
        } else {
          return A->getAddress() < B->getAddress();
        }
      });

    for (const auto *Func : SortedFunctions) {
      if (!Func->hasValidIndex())
        break;
      FuncsFile << Func->getSymbol()->getName().data() << "\n";
    }
    FuncsFile.close();

    outs() << "BOLT-INFO: dumped function order to \""
           << opts::GenerateFunctionOrderFile << "\"\n";

    exit(0);
  }
}

} // namespace bolt
} // namespace llvm

@@ -18,6 +18,7 @@
#include "BinaryFunction.h"
#include "HFSort.h"
#include "llvm/Support/CommandLine.h"

#include <map>
#include <set>
#include <string>

@@ -358,29 +359,6 @@ public:
                      std::set<uint64_t> &LargeFunctions) override;
};

/// Modify function order for streaming based on hotness.
class ReorderFunctions : public BinaryFunctionPass {
  static constexpr uint32_t PageSize = 2 << 20;
  std::vector<BinaryFunction *> Funcs;
  std::unordered_map<const BinaryFunction *, TargetId> FuncToTargetId;
  TargetGraph Cg;

  void buildCallGraph(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs);
  void reorder(std::vector<Cluster> &&Clusters,
               std::map<uint64_t, BinaryFunction> &BFs);
public:
  explicit ReorderFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "reorder-functions";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

} // namespace bolt
} // namespace llvm

@@ -1,5 +1,6 @@
add_llvm_library(LLVMBOLTPasses
  BinaryPasses.cpp
  CallGraph.cpp
  DataflowAnalysis.cpp
  DataflowInfoManager.cpp
  FrameAnalysis.cpp

@@ -9,7 +10,9 @@ add_llvm_library(LLVMBOLTPasses
  IndirectCallPromotion.cpp
  Inliner.cpp
  LivenessAnalysis.cpp
  PettisAndHansen.cpp
  ReorderAlgorithm.cpp
  ReorderFunctions.cpp
  StackPointerTracking.cpp
  )

@@ -0,0 +1,262 @@
//===--- Passes/CallGraph.cpp ---------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "CallGraph.h"
#include "BinaryFunction.h"
#include "BinaryContext.h"

#define DEBUG_TYPE "callgraph"

#if defined(__x86_64__) && !defined(_MSC_VER)
# if (!defined USE_SSECRC)
# define USE_SSECRC
# endif
#else
# undef USE_SSECRC
#endif

namespace {

inline size_t hash_int64_fallback(int64_t key) {
  // "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function."
  // http://www.concentric.net/~ttwang/tech/inthash.htm
  key = (~key) + (key << 21);  // key = (key << 21) - key - 1;
  key = key ^ ((unsigned long long)key >> 24);
  key = (key + (key << 3)) + (key << 8);  // key * 265
  key = key ^ ((unsigned long long)key >> 14);
  key = (key + (key << 2)) + (key << 4);  // key * 21
  key = key ^ ((unsigned long long)key >> 28);
  return static_cast<size_t>(static_cast<uint32_t>(key));
}

inline size_t hash_int64(int64_t k) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
  size_t h = 0;
  __asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k));
  return h;
#else
  return hash_int64_fallback(k);
#endif
}

inline size_t hash_int64_pair(int64_t k1, int64_t k2) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
  // crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes
  // differently from (k2, k1).
  k1 += k1;
  __asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2));
  return k1;
#else
  return (hash_int64(k1) << 1) ^ hash_int64(k2);
#endif
}

}

namespace llvm {
namespace bolt {

int64_t CallGraph::Arc::Hash::operator()(const Arc &Arc) const {
#ifdef USE_STD_HASH
  std::hash<int64_t> Hasher;
  return hashCombine(Hasher(Arc.Src), Arc.Dst);
#else
  return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst));
#endif
}

CallGraph buildCallGraph(BinaryContext &BC,
                         std::map<uint64_t, BinaryFunction> &BFs,
                         std::function<bool (const BinaryFunction &BF)> Filter,
                         bool IncludeColdCalls,
                         bool UseFunctionHotSize,
                         bool UseEdgeCounts) {
  CallGraph Cg;

  // Add call graph nodes.
  auto lookupNode = [&](BinaryFunction *Function) {
    auto It = Cg.FuncToNodeId.find(Function);
    if (It == Cg.FuncToNodeId.end()) {
      // It's ok to use the hot size here when the function is split. This is
      // because emitFunctions will emit the hot part first in the order that is
      // computed by ReorderFunctions. The cold part will be emitted with the
      // rest of the cold functions and code.
      const auto Size = UseFunctionHotSize && Function->isSplit()
        ? Function->estimateHotSize()
        : Function->estimateSize();
      const auto Id = Cg.addNode(Size);
      assert(size_t(Id) == Cg.Funcs.size());
      Cg.Funcs.push_back(Function);
      Cg.FuncToNodeId[Function] = Id;
      // NOTE: for functions without a profile, we set the number of samples
      // to zero. This will keep these functions from appearing in the hot
      // section. This is a little weird because we wouldn't be trying to
      // create a node for a function unless it was the target of a call from
      // a hot block. The alternative would be to set the count to one or
      // accumulate the number of calls from the callsite into the function
      // samples. Results from performance testing seem to favor the zero
      // count though, so I'm leaving it this way for now.
      Cg.Nodes[Id].Samples = Function->hasProfile() ? Function->getExecutionCount() : 0;
      assert(Cg.Funcs[Id] == Function);
      return Id;
    } else {
      return It->second;
    }
  };

  // Add call graph edges.
  uint64_t NotProcessed = 0;
  uint64_t TotalCalls = 0;
  for (auto &It : BFs) {
    auto *Function = &It.second;

    if (Filter(*Function)) {
      continue;
    }

    auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames());
    const auto SrcId = lookupNode(Function);
    uint64_t Offset = Function->getAddress();

    auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) {
      if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) {
        const auto DstId = lookupNode(DstFunc);
        auto &A = Cg.incArcWeight(SrcId, DstId, Count);
        if (!UseEdgeCounts) {
          A.AvgCallOffset += (Offset - DstFunc->getAddress());
        }
        DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function
                     << " -> " << *DstFunc << " @ " << Offset << "\n");
        return true;
      }
      return false;
    };

    for (auto *BB : Function->layout()) {
      // Don't count calls from cold blocks
      if (BB->isCold() && !IncludeColdCalls)
        continue;

      for (auto &Inst : *BB) {
        // Find call instructions and extract target symbols from each one.
        if (!BC.MIA->isCall(Inst))
          continue;

        ++TotalCalls;
        if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) {
          // For direct calls, just use the BB execution count.
          const auto Count = UseEdgeCounts && BB->hasProfile()
            ? BB->getExecutionCount() : 1;
          if (!recordCall(DstSym, Count))
            ++NotProcessed;
        } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) {
          // For indirect calls and jump tables, use branch data.
          if (!BranchDataOrErr) {
            ++NotProcessed;
            continue;
          }
          const FuncBranchData &BranchData = BranchDataOrErr.get();
          const auto DataOffset =
            BC.MIA->getAnnotationAs<uint64_t>(Inst, "EdgeCountData");

          for (const auto &BI : BranchData.getBranchRange(DataOffset)) {
            // Count each target as a separate call.
            ++TotalCalls;

            if (!BI.To.IsSymbol) {
              ++NotProcessed;
              continue;
            }

            auto Itr = BC.GlobalSymbols.find(BI.To.Name);
            if (Itr == BC.GlobalSymbols.end()) {
              ++NotProcessed;
              continue;
            }

            const auto *DstSym =
              BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat");

            if (!recordCall(DstSym, UseEdgeCounts ? BI.Branches : 1))
              ++NotProcessed;
          }
        }

        if (!UseEdgeCounts) {
          Offset += BC.computeCodeSize(&Inst, &Inst + 1);
        }
      }
    }
  }

  outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed
         << " callsites not processed out of " << TotalCalls << "\n";

  return Cg;
}

CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint32_t Samples) {
  auto Id = Nodes.size();
  Nodes.emplace_back(Size, Samples);
  return Id;
}

const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W) {
  auto Res = Arcs.emplace(Src, Dst, W);
  if (!Res.second) {
    Res.first->Weight += W;
    return *Res.first;
  }
  Nodes[Src].Succs.push_back(Dst);
  Nodes[Dst].Preds.push_back(Src);
  return *Res.first;
}

std::deque<BinaryFunction *> CallGraph::buildTraversalOrder() {
  std::deque<BinaryFunction *> TopologicalOrder;
  enum NodeStatus { NEW, VISITING, VISITED };
  std::vector<NodeStatus> NodeStatus(Funcs.size());
  std::stack<NodeId> Worklist;

  for (auto *Func : Funcs) {
    const auto Id = FuncToNodeId.at(Func);
    Worklist.push(Id);
    NodeStatus[Id] = NEW;
  }

  while (!Worklist.empty()) {
    const auto FuncId = Worklist.top();
    Worklist.pop();

    if (NodeStatus[FuncId] == VISITED)
      continue;

    if (NodeStatus[FuncId] == VISITING) {
      TopologicalOrder.push_back(Funcs[FuncId]);
      NodeStatus[FuncId] = VISITED;
      continue;
    }

    assert(NodeStatus[FuncId] == NEW);
    NodeStatus[FuncId] = VISITING;
    Worklist.push(FuncId);
    for (const auto Callee : Nodes[FuncId].Succs) {
      if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
        continue;
      Worklist.push(Callee);
    }
  }

  return TopologicalOrder;
}

}
}
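One note on the traversal above (commentary, not part of the diff): a node is
appended to TopologicalOrder only when it is popped in the VISITING state, so
callees are emitted before their callers — a DFS post-order, with cycles broken
by whatever order the DFS happens to visit them. A minimal consumer sketch,
assuming a Cg built as above:

    // Hypothetical consumer: propagate a summary from callees to callers.
    for (BinaryFunction *Func : Cg.buildTraversalOrder()) {
      // By this point every callee of Func has already been visited
      // (modulo cycles), so its summary is available.
    }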
@@ -0,0 +1,113 @@
//===--- Passes/CallGraph.h -----------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H

#include <string>
#include <unordered_set>
#include <unordered_map>
#include <vector>
#include <functional>
#include <map>
#include <deque>

namespace llvm {
namespace bolt {

class BinaryFunction;
class BinaryContext;

// TODO: find better place for this
inline int64_t hashCombine(const int64_t Seed, const int64_t Val) {
  std::hash<int64_t> Hasher;
  return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2));
}

/// A call graph class.
class CallGraph {
public:
  using NodeId = size_t;
  static constexpr NodeId InvalidId = -1;

  class Arc {
  public:
    struct Hash {
      int64_t operator()(const Arc &Arc) const;
    };

    Arc(NodeId S, NodeId D, double W = 0)
      : Src(S)
      , Dst(D)
      , Weight(W)
    {}
    Arc(const Arc&) = delete;

    friend bool operator==(const Arc &Lhs, const Arc &Rhs) {
      return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst;
    }

    const NodeId Src;
    const NodeId Dst;
    mutable double Weight;
    mutable double NormalizedWeight{0};
    mutable double AvgCallOffset{0};
  };

  class Node {
  public:
    explicit Node(uint32_t Size, uint32_t Samples = 0)
      : Size(Size), Samples(Samples)
    {}

    uint32_t Size;
    uint32_t Samples;

    // preds and succs contain no duplicate elements and self arcs are not allowed
    std::vector<NodeId> Preds;
    std::vector<NodeId> Succs;
  };

  NodeId addNode(uint32_t Size, uint32_t Samples = 0);
  const Arc &incArcWeight(NodeId Src, NodeId Dst, double W = 1.0);

  /// Compute a DFS traversal of the call graph.
  std::deque<BinaryFunction *> buildTraversalOrder();

  std::vector<Node> Nodes;
  std::unordered_set<Arc, Arc::Hash> Arcs;
  std::vector<BinaryFunction *> Funcs;
  std::unordered_map<const BinaryFunction *, NodeId> FuncToNodeId;
};

inline bool NoFilter(const BinaryFunction &) { return false; }

/// Builds a call graph from the map of BinaryFunctions provided in BFs.
/// The arguments control how the graph is constructed.
/// Filter is called on each function; any function that it returns true for
/// is omitted from the graph.
/// If IncludeColdCalls is true, then calls from cold BBs are considered for the
/// graph, otherwise they are ignored.
/// UseFunctionHotSize controls whether the hot size of a function is used when
/// filling in the Size attribute of new Nodes.
/// UseEdgeCounts is used to control if the AvgCallOffset attribute on Arcs is
/// computed using the offsets of call instructions.
CallGraph buildCallGraph(BinaryContext &BC,
                         std::map<uint64_t, BinaryFunction> &BFs,
                         std::function<bool (const BinaryFunction &BF)> Filter = NoFilter,
                         bool IncludeColdCalls = true,
                         bool UseFunctionHotSize = false,
                         bool UseEdgeCounts = false);
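As a usage sketch (an assumed call site, not code from this commit):
reproducing the filtering that the old ReorderFunctions::buildCallGraph did —
skip unoptimizable or profile-less functions, ignore cold blocks, honor the
hot-size and edge-count options — would look roughly like:

    // Hypothetical call site mirroring the old ReorderFunctions behavior.
    auto Cg = buildCallGraph(BC, BFs,
        [&](const BinaryFunction &BF) {
          return !shouldOptimize(BF) || !BF.hasProfile();  // drop these
        },
        /*IncludeColdCalls=*/false,
        /*UseFunctionHotSize=*/opts::ReorderFunctionsUseHotSize,
        /*UseEdgeCounts=*/opts::UseEdgeCounts);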

} // namespace bolt
} // namespace llvm

#endif

@@ -275,71 +275,6 @@ FrameAnalysis::getFIEFor(const BinaryContext &BC, const MCInst &Inst) const {
  return make_error_code(errc::result_out_of_range);
}

void FrameAnalysis::buildCallGraph(BinaryContext &BC,
                                   std::map<uint64_t, BinaryFunction> &BFs) {
  for (auto &I : BFs) {
    BinaryFunction &Caller = I.second;

    Functions.emplace(&Caller);

    for (BinaryBasicBlock &BB : Caller) {
      for (MCInst &Inst : BB) {
        if (!BC.MIA->isCall(Inst))
          continue;

        auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
        if (!TargetSymbol) {
          // This is an indirect call, we cannot record a target.
          continue;
        }

        auto *Function = BC.getFunctionForSymbol(TargetSymbol);
        if (!Function) {
          // Call to a function without a BinaryFunction object.
          continue;
        }
        // Create a new edge in the call graph
        CallGraphEdges[&Caller].emplace_back(Function);
        ReverseCallGraphEdges[Function].emplace_back(&Caller);
      }
    }
  }
}

void FrameAnalysis::buildCGTraversalOrder() {
  enum NodeStatus { NEW, VISITING, VISITED };
  std::unordered_map<const BinaryFunction *, NodeStatus> NodeStatus;
  std::stack<BinaryFunction *> Worklist;

  for (auto *Func : Functions) {
    Worklist.push(Func);
    NodeStatus[Func] = NEW;
  }

  while (!Worklist.empty()) {
    auto *Func = Worklist.top();
    Worklist.pop();

    if (NodeStatus[Func] == VISITED)
      continue;

    if (NodeStatus[Func] == VISITING) {
      TopologicalCGOrder.push_back(Func);
      NodeStatus[Func] = VISITED;
      continue;
    }

    assert(NodeStatus[Func] == NEW);
    NodeStatus[Func] = VISITING;
    Worklist.push(Func);
    for (auto *Callee : CallGraphEdges[Func]) {
      if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
        continue;
      Worklist.push(Callee);
    }
  }
}

void FrameAnalysis::getInstClobberList(const BinaryContext &BC,
                                       const MCInst &Inst,
                                       BitVector &KillSet) const {

@@ -412,8 +347,8 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) {
  }

  if (RegsKilledMap[Func] != RegsKilled || Updated) {
    for (auto Caller : ReverseCallGraphEdges[Func]) {
      Queue.push(Caller);
    for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) {
      Queue.push(Cg.Funcs[Caller]);
    }
  }
  RegsKilledMap[Func] = std::move(RegsKilled);

@@ -647,11 +582,11 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC,
                                   std::set<uint64_t> &) {
  {
    NamedRegionTimer T1("Callgraph construction", "FOP breakdown", true);
    buildCallGraph(BC, BFs);
    Cg = buildCallGraph(BC, BFs);
  }
  {
    NamedRegionTimer T1("build cg traversal order", "FOP breakdown", true);
    buildCGTraversalOrder();
    TopologicalCGOrder = Cg.buildTraversalOrder();
  }
  {
    NamedRegionTimer T1("build clobber map", "FOP breakdown", true);

@@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H

#include "BinaryPasses.h"
#include "CallGraph.h"
#include "StackPointerTracking.h"

namespace llvm {

@@ -112,14 +113,8 @@ raw_ostream &operator<<(raw_ostream &OS,
///
class FrameAnalysis : public BinaryFunctionPass {
  /// Call graph info
  /// The set of functions analyzed by our call graph
  std::set<BinaryFunction *> Functions;
  /// Model the "function calls function" edges
  std::map<const BinaryFunction *, std::vector<BinaryFunction *>>
      CallGraphEdges;
  /// Model the "function called by function" edges
  std::map<const BinaryFunction *, std::vector<BinaryFunction *>>
      ReverseCallGraphEdges;
  CallGraph Cg;

  /// DFS or reverse post-ordering of the call graph nodes to allow us to
  /// traverse the call graph bottom-up
  std::deque<BinaryFunction *> TopologicalCGOrder;

@@ -169,15 +164,6 @@ class FrameAnalysis : public BinaryFunctionPass {
  void addFIEFor(const BinaryContext &BC, MCInst &Inst,
                 const FrameIndexEntry &FIE);

  /// Perform the initial step of populating CallGraphEdges and
  /// ReverseCallGraphEdges for all functions in BFs.
  void buildCallGraph(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs);

  /// Compute a DFS traversal of the call graph in Functions, CallGraphEdges
  /// and ReverseCallGraphEdges and stores it in TopologicalCGOrder.
  void buildCGTraversalOrder();

  /// Compute the set of registers \p Func may write to during its execution,
  /// starting at the point when it is called up until when it returns. Returns
  /// a BitVector the size of the target number of registers, representing the

@@ -24,71 +24,6 @@ extern cl::opt<unsigned> Verbosity;
namespace llvm {
namespace bolt {

void FrameOptimizerPass::buildCallGraph(
    const BinaryContext &BC, std::map<uint64_t, BinaryFunction> &BFs) {
  for (auto &I : BFs) {
    BinaryFunction &Caller = I.second;

    Functions.emplace(&Caller);

    for (BinaryBasicBlock &BB : Caller) {
      for (MCInst &Inst : BB) {
        if (!BC.MIA->isCall(Inst))
          continue;

        const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
        if (!TargetSymbol) {
          // This is an indirect call, we cannot record a target.
          continue;
        }

        const auto *Function = BC.getFunctionForSymbol(TargetSymbol);
        if (!Function) {
          // Call to a function without a BinaryFunction object.
          continue;
        }
        // Create a new edge in the call graph
        CallGraphEdges[&Caller].emplace_back(Function);
        ReverseCallGraphEdges[Function].emplace_back(&Caller);
      }
    }
  }
}

void FrameOptimizerPass::buildCGTraversalOrder() {
  enum NodeStatus { NEW, VISITING, VISITED };
  std::unordered_map<const BinaryFunction *, NodeStatus> NodeStatus;
  std::stack<const BinaryFunction *> Worklist;

  for (auto *Func : Functions) {
    Worklist.push(Func);
    NodeStatus[Func] = NEW;
  }

  while (!Worklist.empty()) {
    const auto *Func = Worklist.top();
    Worklist.pop();

    if (NodeStatus[Func] == VISITED)
      continue;

    if (NodeStatus[Func] == VISITING) {
      TopologicalCGOrder.push_back(Func);
      NodeStatus[Func] = VISITED;
      continue;
    }

    assert(NodeStatus[Func] == NEW);
    NodeStatus[Func] = VISITING;
    Worklist.push(Func);
    for (const auto *Callee : CallGraphEdges[Func]) {
      if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED)
        continue;
      Worklist.push(Callee);
    }
  }
}

void FrameOptimizerPass::getInstClobberList(const BinaryContext &BC,
                                            const MCInst &Inst,
                                            BitVector &KillSet) const {

@@ -161,8 +96,8 @@ void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) {
  }

  if (RegsKilledMap[Func] != RegsKilled) {
    for (auto Caller : ReverseCallGraphEdges[Func]) {
      Queue.push(Caller);
    for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) {
      Queue.push(Cg.Funcs[Caller]);
    }
  }
  RegsKilledMap[Func] = std::move(RegsKilled);

@@ -794,8 +729,8 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
  uint64_t CountFunctionsNotOptimized{0};
  uint64_t CountFunctionsFailedRestoreFI{0};
  uint64_t CountDenominator{0};
  buildCallGraph(BC, BFs);
  buildCGTraversalOrder();
  Cg = buildCallGraph(BC, BFs);
  TopologicalCGOrder = Cg.buildTraversalOrder();
  buildClobberMap(BC);
  for (auto &I : BFs) {
    auto Count = I.second.getExecutionCount();

@@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H

#include "BinaryPasses.h"
#include "CallGraph.h"

namespace llvm {
namespace bolt {

@@ -75,17 +76,11 @@ class FrameOptimizerPass : public BinaryFunctionPass {
  uint64_t CountFunctionsAllClobber{0};

  /// Call graph info
  /// The set of functions analyzed by our call graph
  std::set<BinaryFunction *> Functions;
  /// Model the "function calls function" edges
  std::map<const BinaryFunction *, std::vector<const BinaryFunction *>>
      CallGraphEdges;
  /// Model the "function called by function" edges
  std::map<const BinaryFunction *, std::vector<const BinaryFunction *>>
      ReverseCallGraphEdges;
  CallGraph Cg;

  /// DFS or reverse post-ordering of the call graph nodes to allow us to
  /// traverse the call graph bottom-up
  std::deque<const BinaryFunction *> TopologicalCGOrder;
  std::deque<BinaryFunction *> TopologicalCGOrder;

  /// Map functions to the set of registers they may overwrite starting at when
  /// it is called until it returns to the caller.

@@ -126,15 +121,6 @@ public:
  void getInstClobberList(const BinaryContext &BC, const MCInst &Inst,
                          BitVector &KillSet) const;
private:
  /// Perform the initial step of populating CallGraphEdges and
  /// ReverseCallGraphEdges for all functions in BFs.
  void buildCallGraph(const BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs);

  /// Compute a DFS traversal of the call graph in Functions, CallGraphEdges
  /// and ReverseCallGraphEdges and stores it in TopologicalCGOrder.
  void buildCGTraversalOrder();

  /// Compute the set of registers \p Func may write to during its execution,
  /// starting at the point when it is called up until when it returns. Returns
  /// a BitVector the size of the target number of registers, representing the

@@ -40,6 +40,10 @@
namespace llvm {
namespace bolt {

using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;

namespace {

// The number of pages to reserve for the functions with highest

@@ -55,32 +59,11 @@ constexpr double MinArcProbability = 0.1;
// willing to degrade its density by merging a callee.
constexpr int CallerDegradeFactor = 8;

// Maximum size of a cluster, in bytes.
constexpr uint32_t MaxClusterSize = 1 << 20;

constexpr uint32_t PageSize = 2 << 20;

}

////////////////////////////////////////////////////////////////////////////////

TargetId TargetGraph::addTarget(uint32_t Size, uint32_t Samples) {
  auto Id = Targets.size();
  Targets.emplace_back(Size, Samples);
  return Id;
}

const Arc &TargetGraph::incArcWeight(TargetId Src, TargetId Dst, double W) {
  auto Res = Arcs.emplace(Src, Dst, W);
  if (!Res.second) {
    Res.first->Weight += W;
    return *Res.first;
  }
  Targets[Src].Succs.push_back(Dst);
  Targets[Dst].Preds.push_back(Src);
  return *Res.first;
}

Cluster::Cluster(TargetId Id, const TargetNode &Func) {
Cluster::Cluster(NodeId Id, const Node &Func) {
  Targets.push_back(Id);
  Size = Func.Size;
  Samples = Func.Samples;

@@ -103,53 +86,47 @@ std::string Cluster::toString() const {
}

namespace {
////////////////////////////////////////////////////////////////////////////////

bool compareClustersDensity(const Cluster &C1, const Cluster &C2) {
  return C1.density() > C2.density();
}

////////////////////////////////////////////////////////////////////////////////

void freezeClusters(const TargetGraph &Cg, std::vector<Cluster> &Clusters) {
void freezeClusters(const CallGraph &Cg, std::vector<Cluster> &Clusters) {
  uint32_t TotalSize = 0;
  std::sort(Clusters.begin(), Clusters.end(), compareClustersDensity);
  for (auto &C : Clusters) {
    uint32_t NewSize = TotalSize + C.Size;
    if (NewSize > FrozenPages * PageSize) break;
    if (NewSize > FrozenPages * HugePageSize) break;
    C.Frozen = true;
    TotalSize = NewSize;
    auto Fid = C.Targets[0];
    DEBUG(dbgs() <<
          format("freezing cluster for func %d, size = %u, samples = %u)\n",
                 Fid, Cg.Targets[Fid].Size, Cg.Targets[Fid].Samples););
                 Fid, Cg.Nodes[Fid].Size, Cg.Nodes[Fid].Samples););
  }
}

void mergeInto(Cluster &Into, Cluster&& Other, const double Aw = 0) {
  Into.Targets.insert(Into.Targets.end(),
                      Other.Targets.begin(),
                      Other.Targets.end());
  Into.Size += Other.Size;
  Into.Samples += Other.Samples;
}

void Cluster::merge(Cluster&& Other, const double Aw) {
  Targets.insert(Targets.end(),
                 Other.Targets.begin(),
                 Other.Targets.end());
  Size += Other.Size;
  Samples += Other.Samples;

  Other.Size = 0;
  Other.Samples = 0;
  Other.Targets.clear();
}
}

std::vector<Cluster> clusterize(const TargetGraph &Cg) {
  std::vector<TargetId> SortedFuncs;
std::vector<Cluster> clusterize(const CallGraph &Cg) {
  std::vector<NodeId> SortedFuncs;

  // indexed by TargetId, keeps its current cluster
  std::vector<Cluster*> FuncCluster(Cg.Targets.size(), nullptr);
  // indexed by NodeId, keeps its current cluster
  std::vector<Cluster*> FuncCluster(Cg.Nodes.size(), nullptr);
  std::vector<Cluster> Clusters;
  Clusters.reserve(Cg.Targets.size());
  Clusters.reserve(Cg.Nodes.size());

  for (TargetId F = 0; F < Cg.Targets.size(); F++) {
    if (Cg.Targets[F].Samples == 0) continue;
    Clusters.emplace_back(F, Cg.Targets[F]);
  for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
    if (Cg.Nodes[F].Samples == 0) continue;
    Clusters.emplace_back(F, Cg.Nodes[F]);
    SortedFuncs.push_back(F);
  }

@@ -164,9 +141,9 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
  std::sort(
    SortedFuncs.begin(),
    SortedFuncs.end(),
    [&] (const TargetId F1, const TargetId F2) {
      const auto &Func1 = Cg.Targets[F1];
      const auto &Func2 = Cg.Targets[F2];
    [&] (const NodeId F1, const NodeId F2) {
      const auto &Func1 = Cg.Nodes[F1];
      const auto &Func2 = Cg.Nodes[F2];
      return
        (uint64_t)Func1.Samples * Func2.Size >  // TODO: is this correct?
        (uint64_t)Func2.Samples * Func1.Size;

@@ -180,12 +157,12 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
    if (Cluster->Frozen) continue;

    // Find best predecessor.
    TargetId BestPred = InvalidId;
    NodeId BestPred = CallGraph::InvalidId;
    double BestProb = 0;

    for (const auto Src : Cg.Targets[Fid].Preds) {
    for (const auto Src : Cg.Nodes[Fid].Preds) {
      auto &A = *Cg.Arcs.find(Arc(Src, Fid));
      if (BestPred == InvalidId || A.NormalizedWeight > BestProb) {
      if (BestPred == CallGraph::InvalidId || A.NormalizedWeight > BestProb) {
        BestPred = A.Src;
        BestProb = A.NormalizedWeight;
      }

@@ -196,7 +173,7 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
    // caller is too low.
    if (BestProb < MinArcProbability) continue;

    assert(BestPred != InvalidId);
    assert(BestPred != CallGraph::InvalidId);

    auto PredCluster = FuncCluster[BestPred];

@@ -223,13 +200,13 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
    DEBUG(dbgs() << format("merging %s -> %s: %u\n",
                           PredCluster->toString().c_str(),
                           Cluster->toString().c_str(),
                           Cg.Targets[Fid].Samples););
                           Cg.Nodes[Fid].Samples););

    for (auto F : Cluster->Targets) {
      FuncCluster[F] = PredCluster;
    }

    mergeInto(*PredCluster, std::move(*Cluster));
    PredCluster->merge(std::move(*Cluster));
  }

  // Return the set of Clusters that are left, which are the ones that

@@ -250,203 +227,14 @@ std::vector<Cluster> clusterize(const TargetGraph &Cg) {
  return SortedClusters;
}

////////////////////////////////////////////////////////////////////////////////

namespace {
class ClusterArc {
public:
  ClusterArc(Cluster *Ca, Cluster *Cb, double W = 0)
    : C1(std::min(Ca, Cb))
    , C2(std::max(Ca, Cb))
    , Weight(W)
  {}

  friend bool operator==(const ClusterArc &Lhs, const ClusterArc &Rhs) {
    return Lhs.C1 == Rhs.C1 && Lhs.C2 == Rhs.C2;
  }

  Cluster *const C1;
  Cluster *const C2;
  mutable double Weight;
};

class ClusterArcHash {
public:
  int64_t operator()(const ClusterArc &Arc) const {
    std::hash<int64_t> Hasher;
    return hashCombine(Hasher(int64_t(Arc.C1)), int64_t(Arc.C2));
  }
};

using ClusterArcSet = std::unordered_set<ClusterArc, ClusterArcHash>;

void orderFuncs(const TargetGraph &Cg, Cluster *C1, Cluster *C2) {
  TargetId C1head = C1->Targets.front();
  TargetId C1tail = C1->Targets.back();
  TargetId C2head = C2->Targets.front();
  TargetId C2tail = C2->Targets.back();

  double C1headC2head = 0;
  double C1headC2tail = 0;
  double C1tailC2head = 0;
  double C1tailC2tail = 0;

  for (const auto &Arc : Cg.Arcs) {
    if ((Arc.Src == C1head && Arc.Dst == C2head) ||
        (Arc.Dst == C1head && Arc.Src == C2head)) {
      C1headC2head += Arc.Weight;
    } else if ((Arc.Src == C1head && Arc.Dst == C2tail) ||
               (Arc.Dst == C1head && Arc.Src == C2tail)) {
      C1headC2tail += Arc.Weight;
    } else if ((Arc.Src == C1tail && Arc.Dst == C2head) ||
               (Arc.Dst == C1tail && Arc.Src == C2head)) {
      C1tailC2head += Arc.Weight;
    } else if ((Arc.Src == C1tail && Arc.Dst == C2tail) ||
               (Arc.Dst == C1tail && Arc.Src == C2tail)) {
      C1tailC2tail += Arc.Weight;
    }
  }

  const double Max = std::max(std::max(C1headC2head, C1headC2tail),
                              std::max(C1tailC2head, C1tailC2tail));

  if (C1headC2head == Max) {
    // flip C1
    std::reverse(C1->Targets.begin(), C1->Targets.end());
  } else if (C1headC2tail == Max) {
    // flip C1 C2
    std::reverse(C1->Targets.begin(), C1->Targets.end());
    std::reverse(C2->Targets.begin(), C2->Targets.end());
  } else if (C1tailC2tail == Max) {
    // flip C2
    std::reverse(C2->Targets.begin(), C2->Targets.end());
  }
}
}

std::vector<Cluster> pettisAndHansen(const TargetGraph &Cg) {
  // indexed by TargetId, keeps its current cluster
  std::vector<Cluster*> FuncCluster(Cg.Targets.size(), nullptr);
std::vector<Cluster> randomClusters(const CallGraph &Cg) {
  std::vector<NodeId> FuncIds(Cg.Nodes.size(), 0);
  std::vector<Cluster> Clusters;
  std::vector<TargetId> Funcs;
  Clusters.reserve(Cg.Nodes.size());

  Clusters.reserve(Cg.Targets.size());

  for (TargetId F = 0; F < Cg.Targets.size(); F++) {
    if (Cg.Targets[F].Samples == 0) continue;
    Clusters.emplace_back(F, Cg.Targets[F]);
    FuncCluster[F] = &Clusters.back();
    Funcs.push_back(F);
  }

  ClusterArcSet Carcs;

  auto insertOrInc = [&](Cluster *C1, Cluster *C2, double Weight) {
    auto Res = Carcs.emplace(C1, C2, Weight);
    if (!Res.second) {
      Res.first->Weight += Weight;
    }
  };

  // Create a std::vector of cluster arcs

  for (auto &Arc : Cg.Arcs) {
    if (Arc.Weight == 0) continue;

    auto const S = FuncCluster[Arc.Src];
    auto const D = FuncCluster[Arc.Dst];

    // ignore if s or d is nullptr

    if (S == nullptr || D == nullptr) continue;

    // ignore self-edges

    if (S == D) continue;

    insertOrInc(S, D, Arc.Weight);
  }

  // Find an arc with max weight and merge its nodes

  while (!Carcs.empty()) {
    auto Maxpos = std::max_element(
      Carcs.begin(),
      Carcs.end(),
      [&] (const ClusterArc &Carc1, const ClusterArc &Carc2) {
        return Carc1.Weight < Carc2.Weight;
      }
    );

    auto Max = *Maxpos;
    Carcs.erase(Maxpos);

    auto const C1 = Max.C1;
    auto const C2 = Max.C2;

    if (C1->Size + C2->Size > MaxClusterSize) continue;

    if (C1->Frozen || C2->Frozen) continue;

    // order functions and merge cluster

    orderFuncs(Cg, C1, C2);

    DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(),
                           C1->toString().c_str(), Max.Weight););

    // update carcs: merge C1arcs to C2arcs

    std::unordered_map<ClusterArc, Cluster *, ClusterArcHash> C2arcs;
    for (auto &Carc : Carcs) {
      if (Carc.C1 == C2) C2arcs.emplace(Carc, Carc.C2);
      if (Carc.C2 == C2) C2arcs.emplace(Carc, Carc.C1);
    }

    for (auto It : C2arcs) {
      auto const C = It.second;
      auto const C2arc = It.first;

      insertOrInc(C, C1, C2arc.Weight);
      Carcs.erase(C2arc);
    }

    // update FuncCluster

    for (auto F : C2->Targets) {
      FuncCluster[F] = C1;
    }
    mergeInto(*C1, std::move(*C2), Max.Weight);
  }

  // Return the set of Clusters that are left, which are the ones that
  // didn't get merged.

  std::set<Cluster*> LiveClusters;
  std::vector<Cluster> OutClusters;

  for (auto Fid : Funcs) {
    LiveClusters.insert(FuncCluster[Fid]);
  }
  for (auto C : LiveClusters) {
    OutClusters.push_back(std::move(*C));
  }

  std::sort(OutClusters.begin(),
            OutClusters.end(),
            compareClustersDensity);

  return OutClusters;
}

std::vector<Cluster> randomClusters(const TargetGraph &Cg) {
  std::vector<TargetId> FuncIds(Cg.Targets.size(), 0);
  std::vector<Cluster> Clusters;
  Clusters.reserve(Cg.Targets.size());

  for (TargetId F = 0; F < Cg.Targets.size(); F++) {
    if (Cg.Targets[F].Samples == 0) continue;
    Clusters.emplace_back(F, Cg.Targets[F]);
  for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
    if (Cg.Nodes[F].Samples == 0) continue;
    Clusters.emplace_back(F, Cg.Nodes[F]);
  }

  std::sort(Clusters.begin(),

@@ -477,7 +265,7 @@ std::vector<Cluster> randomClusters(const TargetGraph &Cg) {
  if (MergeIdx == Clusters.size()) {
    ++Idx;
  } else {
    mergeInto(Clusters[Idx], std::move(Clusters[MergeIdx]));
    Clusters[Idx].merge(std::move(Clusters[MergeIdx]));
    Clusters.erase(Clusters.begin() + MergeIdx);
  }
}

@@ -37,157 +37,60 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_HFSORT_H
#define LLVM_TOOLS_LLVM_BOLT_HFSORT_H

#include <string>
#include <unordered_set>
#include <vector>
#include <functional>
#include "CallGraph.h"

#if defined(__x86_64__) && !defined(_MSC_VER)
#  if (!defined USE_SSECRC)
#    define USE_SSECRC
#  endif
#else
#  undef USE_SSECRC
#endif
#include <string>
#include <vector>

namespace llvm {
namespace bolt {

using TargetId = size_t;
constexpr TargetId InvalidId = -1;

class Arc {
public:
  Arc(TargetId S, TargetId D, double W = 0)
    : Src(S)
    , Dst(D)
    , Weight(W)
  {}
  Arc(const Arc&) = delete;

  friend bool operator==(const Arc &Lhs, const Arc &Rhs) {
    return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst;
  }

  const TargetId Src;
  const TargetId Dst;
  mutable double Weight;
  mutable double NormalizedWeight{0};
  mutable double AvgCallOffset{0};
};

namespace {

inline int64_t hashCombine(const int64_t Seed, const int64_t Val) {
  std::hash<int64_t> Hasher;
  return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2));
}

inline size_t hash_int64_fallback(int64_t key) {
  // "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function."
  // http://www.concentric.net/~ttwang/tech/inthash.htm
  key = (~key) + (key << 21); // key = (key << 21) - key - 1;
  key = key ^ ((unsigned long long)key >> 24);
  key = (key + (key << 3)) + (key << 8); // key * 265
  key = key ^ ((unsigned long long)key >> 14);
  key = (key + (key << 2)) + (key << 4); // key * 21
  key = key ^ ((unsigned long long)key >> 28);
  return static_cast<size_t>(static_cast<uint32_t>(key));
}

inline size_t hash_int64(int64_t k) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
  size_t h = 0;
  __asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k));
  return h;
#else
  return hash_int64_fallback(k);
#endif
}

inline size_t hash_int64_pair(int64_t k1, int64_t k2) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
  // crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes
  // differently from (k2, k1).
  k1 += k1;
  __asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2));
  return k1;
#else
  return (hash_int64(k1) << 1) ^ hash_int64(k2);
#endif
}

}

class ArcHash {
public:
  int64_t operator()(const Arc &Arc) const {
#ifdef USE_STD_HASH
    std::hash<int64_t> Hasher;
    return hashCombine(Hasher(Arc.Src), Arc.Dst);
#else
    return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst));
#endif
  }
};

class TargetNode {
public:
  explicit TargetNode(uint32_t Size, uint32_t Samples = 0)
    : Size(Size), Samples(Samples)
  {}

  uint32_t Size;
  uint32_t Samples;

  // preds and succs contain no duplicate elements and self arcs are not allowed
  std::vector<TargetId> Preds;
  std::vector<TargetId> Succs;
};

class TargetGraph {
public:
  TargetId addTarget(uint32_t Size, uint32_t Samples = 0);
  const Arc &incArcWeight(TargetId Src, TargetId Dst, double W = 1.0);

  std::vector<TargetNode> Targets;
  std::unordered_set<Arc, ArcHash> Arcs;
};

class Cluster {
public:
  Cluster(TargetId Id, const TargetNode &F);
  Cluster(CallGraph::NodeId Id, const CallGraph::Node &F);

  std::string toString() const;
  double density() const {
    return (double)Samples / Size;
  }

  std::vector<TargetId> Targets;
  void merge(Cluster &&Other, const double Aw = 0);

  std::vector<CallGraph::NodeId> Targets;
  uint32_t Samples;
  uint32_t Size;
  bool Frozen; // not a candidate for merging
};

// Maximum size of a cluster, in bytes.
constexpr uint32_t MaxClusterSize = 1 << 20;

// Size of a huge page in bytes.
constexpr uint32_t HugePageSize = 2 << 20;
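// Note: 1 << 20 is 1 MiB; 2 << 20 is 2 MiB, the x86-64 huge page size.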

inline bool compareClustersDensity(const Cluster &C1, const Cluster &C2) {
  return C1.density() > C2.density();
}

/*
 * Cluster functions in order to minimize call distance.
 */
std::vector<Cluster> clusterize(const TargetGraph &Cg);
std::vector<Cluster> clusterize(const CallGraph &Cg);

/*
 * Optimize function placement for iTLB cache and i-cache.
 */
std::vector<Cluster> hfsortPlus(const TargetGraph &Cg);
std::vector<Cluster> hfsortPlus(const CallGraph &Cg);

/*
 * Pettis-Hansen code layout algorithm
 * reference: K. Pettis and R. C. Hansen, "Profile Guided Code Positioning",
 * PLDI '90
 */
std::vector<Cluster> pettisAndHansen(const TargetGraph &Cg);
std::vector<Cluster> pettisAndHansen(const CallGraph &Cg);

/* Group functions into clusters randomly. */
std::vector<Cluster> randomClusters(const TargetGraph &Cg);
std::vector<Cluster> randomClusters(const CallGraph &Cg);

} // namespace bolt
} // namespace llvm

@@ -43,6 +43,10 @@
namespace llvm {
namespace bolt {

using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;

namespace {

// The size of a cache page

@@ -117,7 +121,7 @@ class PrecomputedResults {
// A wrapper for algorithm-wide variables
struct AlgoState {
  // the call graph
  const TargetGraph *Cg;
  const CallGraph *Cg;
  // the total number of samples in the graph
  double TotalSamples;
  // target_id => cluster

@@ -126,10 +130,6 @@ struct AlgoState {
  std::vector<size_t> Addr;
};

bool compareClustersDensity(const Cluster &C1, const Cluster &C2) {
  return C1.density() > C2.density();
}

}

/*

@@ -199,7 +199,7 @@ double expectedCacheHitRatio(const AlgoState &State,
  sortByDensity(Clusters);

  // generate function addresses with an alignment
  std::vector<size_t> Addr(State.Cg->Targets.size(), InvalidAddr);
  std::vector<size_t> Addr(State.Cg->Nodes.size(), InvalidAddr);
  size_t CurAddr = 0;
  // 'hotness' of the pages
  std::vector<double> PageSamples;

@@ -207,11 +207,11 @@ double expectedCacheHitRatio(const AlgoState &State,
    for (auto TargetId : Cluster->Targets) {
      if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16;
      Addr[TargetId] = CurAddr;
      CurAddr += State.Cg->Targets[TargetId].Size;
      CurAddr += State.Cg->Nodes[TargetId].Size;
      // update page weight
      size_t Page = Addr[TargetId] / PageSize;
      while (PageSamples.size() <= Page) PageSamples.push_back(0.0);
      PageSamples[Page] += State.Cg->Targets[TargetId].Samples;
      PageSamples[Page] += State.Cg->Nodes[TargetId].Samples;
    }
  }

@@ -220,12 +220,12 @@ double expectedCacheHitRatio(const AlgoState &State,
  for (auto Cluster : Clusters) {
    for (auto TargetId : Cluster->Targets) {
      size_t Page = Addr[TargetId] / PageSize;
      double Samples = State.Cg->Targets[TargetId].Samples;
      double Samples = State.Cg->Nodes[TargetId].Samples;
      // probability that the page is not present in the cache
      double MissProb = missProbability(State, PageSamples[Page]);

      for (auto Pred : State.Cg->Targets[TargetId].Preds) {
        if (State.Cg->Targets[Pred].Samples == 0) continue;
      for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
        if (State.Cg->Nodes[Pred].Samples == 0) continue;
        auto A = State.Cg->Arcs.find(Arc(Pred, TargetId));

        // the source page

@@ -252,13 +252,13 @@ std::unordered_set<Cluster *> adjacentClusters(const AlgoState &State,
                                               Cluster *C) {
  std::unordered_set<Cluster *> Result;
  for (auto TargetId : C->Targets) {
    for (auto Succ : State.Cg->Targets[TargetId].Succs) {
    for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
      auto SuccCluster = State.FuncCluster[Succ];
      if (SuccCluster != nullptr && SuccCluster != C) {
        Result.insert(SuccCluster);
      }
    }
    for (auto Pred : State.Cg->Targets[TargetId].Preds) {
    for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
      auto PredCluster = State.FuncCluster[Pred];
      if (PredCluster != nullptr && PredCluster != C) {
        Result.insert(PredCluster);

@@ -286,7 +286,7 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) {
double shortCalls(const AlgoState &State, Cluster *Cluster) {
  double Calls = 0;
  for (auto TargetId : Cluster->Targets) {
    for (auto Succ : State.Cg->Targets[TargetId].Succs) {
    for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
      if (State.FuncCluster[Succ] == Cluster) {
        auto A = State.Cg->Arcs.find(Arc(TargetId, Succ));

@@ -310,7 +310,7 @@ double shortCalls(const AlgoState &State,
                  Cluster *ClusterSucc) {
  double Calls = 0;
  for (auto TargetId : ClusterPred->Targets) {
    for (auto Succ : State.Cg->Targets[TargetId].Succs) {
    for (auto Succ : State.Cg->Nodes[TargetId].Succs) {
      if (State.FuncCluster[Succ] == ClusterSucc) {
        auto A = State.Cg->Arcs.find(Arc(TargetId, Succ));

@@ -323,7 +323,7 @@ double shortCalls(const AlgoState &State,
  }

  for (auto TargetId : ClusterPred->Targets) {
    for (auto Pred : State.Cg->Targets[TargetId].Preds) {
    for (auto Pred : State.Cg->Nodes[TargetId].Preds) {
      if (State.FuncCluster[Pred] == ClusterSucc) {
        auto A = State.Cg->Arcs.find(Arc(Pred, TargetId));

@@ -389,7 +389,7 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) {
  for (auto TargetId : Into->Targets) {
    State.FuncCluster[TargetId] = Into;
    State.Addr[TargetId] = CurAddr;
    CurAddr += State.Cg->Targets[TargetId].Size;
    CurAddr += State.Cg->Nodes[TargetId].Size;
  }

  Other->Size = 0;

@@ -400,29 +400,29 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) {
/*
 * HFSortPlus - layout of hot functions with iTLB cache optimization
 */
std::vector<Cluster> hfsortPlus(const TargetGraph &Cg) {
std::vector<Cluster> hfsortPlus(const CallGraph &Cg) {
  // create a cluster for every function
  std::vector<Cluster> AllClusters;
  AllClusters.reserve(Cg.Targets.size());
  for (TargetId F = 0; F < Cg.Targets.size(); F++) {
    AllClusters.emplace_back(F, Cg.Targets[F]);
  AllClusters.reserve(Cg.Nodes.size());
  for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
    AllClusters.emplace_back(F, Cg.Nodes[F]);
  }

  // initialize objects used by the algorithm
  std::vector<Cluster *> Clusters;
  Clusters.reserve(Cg.Targets.size());
  Clusters.reserve(Cg.Nodes.size());
  AlgoState State;
  State.Cg = &Cg;
  State.TotalSamples = 0;
  State.FuncCluster = std::vector<Cluster *>(Cg.Targets.size(), nullptr);
  State.Addr = std::vector<size_t>(Cg.Targets.size(), InvalidAddr);
  for (TargetId F = 0; F < Cg.Targets.size(); F++) {
    if (Cg.Targets[F].Samples == 0) continue;
  State.FuncCluster = std::vector<Cluster *>(Cg.Nodes.size(), nullptr);
  State.Addr = std::vector<size_t>(Cg.Nodes.size(), InvalidAddr);
  for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
    if (Cg.Nodes[F].Samples == 0) continue;

    Clusters.push_back(&AllClusters[F]);
    State.FuncCluster[F] = &AllClusters[F];
    State.Addr[F] = 0;
    State.TotalSamples += Cg.Targets[F].Samples;
    State.TotalSamples += Cg.Nodes[F].Samples;
  }

  DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n"

@@ -0,0 +1,206 @@
#include "HFSort.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <set>
#include <unordered_map>

#undef DEBUG_TYPE
#define DEBUG_TYPE "hfsort"

namespace llvm {
namespace bolt {

using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;

namespace {
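// An undirected arc between two clusters; the endpoints are normalized by
// pointer order so that (A, B) and (B, A) compare and hash as equal.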
class ClusterArc {
public:
  ClusterArc(Cluster *Ca, Cluster *Cb, double W = 0)
    : C1(std::min(Ca, Cb))
    , C2(std::max(Ca, Cb))
    , Weight(W)
  {}

  friend bool operator==(const ClusterArc &Lhs, const ClusterArc &Rhs) {
    return Lhs.C1 == Rhs.C1 && Lhs.C2 == Rhs.C2;
  }

  Cluster *const C1;
  Cluster *const C2;
  mutable double Weight;
};

class ClusterArcHash {
public:
  int64_t operator()(const ClusterArc &Arc) const {
    std::hash<int64_t> Hasher;
    return hashCombine(Hasher(int64_t(Arc.C1)), int64_t(Arc.C2));
  }
};

using ClusterArcSet = std::unordered_set<ClusterArc, ClusterArcHash>;

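// Choose the relative orientation of two clusters that are about to be
// merged: sum the arc weights between the four head/tail endpoint pairs and
// reverse C1 and/or C2 so that the heaviest endpoint pair meets at the
// junction.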
void orderFuncs(const CallGraph &Cg, Cluster *C1, Cluster *C2) {
  auto C1head = C1->Targets.front();
  auto C1tail = C1->Targets.back();
  auto C2head = C2->Targets.front();
  auto C2tail = C2->Targets.back();

  double C1headC2head = 0;
  double C1headC2tail = 0;
  double C1tailC2head = 0;
  double C1tailC2tail = 0;

  for (const auto &Arc : Cg.Arcs) {
    if ((Arc.Src == C1head && Arc.Dst == C2head) ||
        (Arc.Dst == C1head && Arc.Src == C2head)) {
      C1headC2head += Arc.Weight;
    } else if ((Arc.Src == C1head && Arc.Dst == C2tail) ||
               (Arc.Dst == C1head && Arc.Src == C2tail)) {
      C1headC2tail += Arc.Weight;
    } else if ((Arc.Src == C1tail && Arc.Dst == C2head) ||
               (Arc.Dst == C1tail && Arc.Src == C2head)) {
      C1tailC2head += Arc.Weight;
    } else if ((Arc.Src == C1tail && Arc.Dst == C2tail) ||
               (Arc.Dst == C1tail && Arc.Src == C2tail)) {
      C1tailC2tail += Arc.Weight;
    }
  }

  const double Max = std::max(std::max(C1headC2head, C1headC2tail),
                              std::max(C1tailC2head, C1tailC2tail));

  if (C1headC2head == Max) {
    // flip C1
    std::reverse(C1->Targets.begin(), C1->Targets.end());
  } else if (C1headC2tail == Max) {
    // flip both C1 and C2
    std::reverse(C1->Targets.begin(), C1->Targets.end());
    std::reverse(C2->Targets.begin(), C2->Targets.end());
  } else if (C1tailC2tail == Max) {
    // flip C2
    std::reverse(C2->Targets.begin(), C2->Targets.end());
  }
}
} // anonymous namespace

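// Pettis-Hansen clustering: start with one cluster per hot function, then
// repeatedly merge the two clusters joined by the heaviest arc, skipping
// merges that would exceed MaxClusterSize or involve a frozen cluster.
// Surviving clusters are returned in decreasing density order.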
std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
  // indexed by NodeId, keeps its current cluster
  std::vector<Cluster*> FuncCluster(Cg.Nodes.size(), nullptr);
  std::vector<Cluster> Clusters;
  std::vector<NodeId> Funcs;

  Clusters.reserve(Cg.Nodes.size());

  for (NodeId F = 0; F < Cg.Nodes.size(); F++) {
    if (Cg.Nodes[F].Samples == 0) continue;
    Clusters.emplace_back(F, Cg.Nodes[F]);
    FuncCluster[F] = &Clusters.back();
    Funcs.push_back(F);
  }

  ClusterArcSet Carcs;

  auto insertOrInc = [&](Cluster *C1, Cluster *C2, double Weight) {
    auto Res = Carcs.emplace(C1, C2, Weight);
    if (!Res.second) {
      Res.first->Weight += Weight;
    }
  };

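  // ClusterArc equality and hashing ignore Weight, so emplace() deduplicates
  // by endpoint pair and insertOrInc accumulates weight on the existing arc.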
  // Create a std::vector of cluster arcs
  for (auto &Arc : Cg.Arcs) {
    if (Arc.Weight == 0) continue;

    auto const S = FuncCluster[Arc.Src];
    auto const D = FuncCluster[Arc.Dst];

    // ignore if S or D is nullptr
    if (S == nullptr || D == nullptr) continue;

    // ignore self-edges
    if (S == D) continue;

    insertOrInc(S, D, Arc.Weight);
  }

  // Find an arc with max weight and merge its nodes
  while (!Carcs.empty()) {
    auto Maxpos = std::max_element(
      Carcs.begin(),
      Carcs.end(),
      [&] (const ClusterArc &Carc1, const ClusterArc &Carc2) {
        return Carc1.Weight < Carc2.Weight;
      }
    );

    auto Max = *Maxpos;
    Carcs.erase(Maxpos);

    auto const C1 = Max.C1;
    auto const C2 = Max.C2;

    if (C1->Size + C2->Size > MaxClusterSize) continue;

    if (C1->Frozen || C2->Frozen) continue;

    // order functions and merge clusters
    orderFuncs(Cg, C1, C2);

    DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(),
                           C1->toString().c_str(), Max.Weight););

    // update Carcs: redirect the arcs that touch C2 to C1
    std::unordered_map<ClusterArc, Cluster *, ClusterArcHash> C2arcs;
    for (auto &Carc : Carcs) {
      if (Carc.C1 == C2) C2arcs.emplace(Carc, Carc.C2);
      if (Carc.C2 == C2) C2arcs.emplace(Carc, Carc.C1);
    }

    for (auto It : C2arcs) {
      auto const C = It.second;
      auto const C2arc = It.first;

      insertOrInc(C, C1, C2arc.Weight);
      Carcs.erase(C2arc);
    }

    // update FuncCluster
    for (auto F : C2->Targets) {
      FuncCluster[F] = C1;
    }
    C1->merge(std::move(*C2), Max.Weight);
  }

  // Return the set of clusters that are left, which are the ones that
  // didn't get merged.
  std::set<Cluster*> LiveClusters;
  std::vector<Cluster> OutClusters;

  for (auto Fid : Funcs) {
    LiveClusters.insert(FuncCluster[Fid]);
  }
  for (auto C : LiveClusters) {
    OutClusters.push_back(std::move(*C));
  }

  std::sort(OutClusters.begin(),
            OutClusters.end(),
            compareClustersDensity);

  return OutClusters;
}

} // namespace bolt
} // namespace llvm

@@ -0,0 +1,406 @@
//===--- ReorderFunctions.cpp - Function reordering pass ------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "ReorderFunctions.h"
#include "llvm/Support/Options.h"
#include <fstream>

#define DEBUG_TYPE "hfsort"

using namespace llvm;

namespace opts {

extern cl::OptionCategory BoltOptCategory;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> Relocs;
extern cl::opt<uint32_t> RandomSeed;

extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern size_t padFunction(const bolt::BinaryFunction &Function);

cl::opt<bolt::BinaryFunction::ReorderType>
ReorderFunctions("reorder-functions",
  cl::desc("reorder and cluster functions (works only with relocations)"),
  cl::init(bolt::BinaryFunction::RT_NONE),
  cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE,
      "none",
      "do not reorder functions"),
    clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT,
      "exec-count",
      "order by execution count"),
    clEnumValN(bolt::BinaryFunction::RT_HFSORT,
      "hfsort",
      "use hfsort algorithm"),
    clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS,
      "hfsort+",
      "use hfsort+ algorithm"),
    clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN,
      "pettis-hansen",
      "use Pettis-Hansen algorithm"),
    clEnumValN(bolt::BinaryFunction::RT_RANDOM,
      "random",
      "reorder functions randomly"),
    clEnumValN(bolt::BinaryFunction::RT_USER,
      "user",
      "use function order specified by -function-order"),
    clEnumValEnd),
  cl::cat(BoltOptCategory));

static cl::opt<bool>
ReorderFunctionsUseHotSize("reorder-functions-use-hot-size",
  cl::desc("use a function's hot size when doing clustering"),
  cl::init(true),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<std::string>
FunctionOrderFile("function-order",
  cl::desc("file containing an ordered list of functions to use for function "
           "reordering"),
  cl::cat(BoltOptCategory));

static cl::opt<std::string>
GenerateFunctionOrderFile("generate-function-order",
  cl::desc("file to dump the ordered list of functions to use for function "
           "reordering"),
  cl::cat(BoltOptCategory));

static cl::opt<bool>
UseEdgeCounts("use-edge-counts",
  cl::desc("use edge count data when doing clustering"),
  cl::init(true),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

} // namespace opts

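// Illustrative invocation (hypothetical binary and file names; only the
// flags defined or declared above are assumed to exist):
//   llvm-bolt a.out -o a.bolt -relocs -reorder-functions=hfsort+ \
//       -generate-function-order=order.txt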
namespace llvm {
namespace bolt {

using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;

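// Turn raw arc weights into per-callee call frequencies: each incoming arc's
// weight is divided by the callee's total sample count. Without
// -use-edge-counts, the accumulated call-site offsets are also averaged out
// into AvgCallOffset.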
void ReorderFunctions::normalizeArcWeights() {
  // Normalize arc weights.
  if (!opts::UseEdgeCounts) {
    for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) {
      auto &Func = Cg.Nodes[FuncId];
      for (auto Caller : Func.Preds) {
        auto &A = *Cg.Arcs.find(Arc(Caller, FuncId));
        A.NormalizedWeight = A.Weight / Func.Samples;
        A.AvgCallOffset /= A.Weight;
        assert(A.AvgCallOffset < Cg.Nodes[Caller].Size);
      }
    }
  } else {
    for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) {
      auto &Func = Cg.Nodes[FuncId];
      for (auto Caller : Func.Preds) {
        auto &A = *Cg.Arcs.find(Arc(Caller, FuncId));
        A.NormalizedWeight = A.Weight / Func.Samples;
      }
    }
  }
}

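// Assign the output index of each hot function, walking the clusters in
// layout order. Everything below the index assignment is statistics
// printing; it runs only at Verbosity > 0 or under -debug-only=hfsort.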
void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
                               std::map<uint64_t, BinaryFunction> &BFs) {
  std::vector<uint64_t> FuncAddr(Cg.Nodes.size());  // Just for computing stats
  uint64_t TotalSize = 0;
  uint32_t Index = 0;

  // Set order of hot functions based on clusters.
  for (const auto &Cluster : Clusters) {
    for (const auto FuncId : Cluster.Targets) {
      assert(Cg.Nodes[FuncId].Samples > 0);
      Cg.Funcs[FuncId]->setIndex(Index++);
      FuncAddr[FuncId] = TotalSize;
      TotalSize += Cg.Nodes[FuncId].Size;
    }
  }

  if (opts::ReorderFunctions == BinaryFunction::RT_NONE)
    return;

  if (opts::Verbosity == 0) {
#ifndef NDEBUG
    if (!DebugFlag || !isCurrentDebugType("hfsort"))
      return;
#else
    return;
#endif
  }

  TotalSize = 0;
  uint64_t CurPage = 0;
  uint64_t Hotfuncs = 0;
  double TotalDistance = 0;
  double TotalCalls = 0;
  double TotalCalls64B = 0;
  double TotalCalls4KB = 0;
  double TotalCalls2MB = 0;
  dbgs() << "============== page 0 ==============\n";
  for (auto &Cluster : Clusters) {
    dbgs() <<
      format("-------- density = %.3lf (%u / %u) --------\n",
             (double) Cluster.Samples / Cluster.Size,
             Cluster.Samples, Cluster.Size);

    for (auto FuncId : Cluster.Targets) {
      if (Cg.Nodes[FuncId].Samples > 0) {
        Hotfuncs++;

        dbgs() << "BOLT-INFO: hot func " << *Cg.Funcs[FuncId]
               << " (" << Cg.Nodes[FuncId].Size << ")\n";

        uint64_t Dist = 0;
        uint64_t Calls = 0;
        for (auto Dst : Cg.Nodes[FuncId].Succs) {
          auto &A = *Cg.Arcs.find(Arc(FuncId, Dst));
          auto D =
            std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset));
          auto W = A.Weight;
          Calls += W;
          if (D < 64) TotalCalls64B += W;
          if (D < 4096) TotalCalls4KB += W;
          if (D < (2 << 20)) TotalCalls2MB += W;
          Dist += A.Weight * D;
          dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
                           "weight = %.0lf, callDist = %f\n",
                           A.Src, FuncAddr[A.Src], A.AvgCallOffset,
                           A.Dst, FuncAddr[A.Dst], A.Weight, D);
        }
        TotalCalls += Calls;
        TotalDistance += Dist;
        dbgs() << format("start = %6u : avgCallDist = %lu : %s\n",
                         TotalSize,
                         Calls ? Dist / Calls : 0,
                         Cg.Funcs[FuncId]->getPrintName().c_str());
        TotalSize += Cg.Nodes[FuncId].Size;
        auto NewPage = TotalSize / HugePageSize;
        if (NewPage != CurPage) {
          CurPage = NewPage;
          dbgs() << format("============== page %u ==============\n", CurPage);
        }
      }
    }
  }
  dbgs() << format("  Number of hot functions: %u\n"
                   "  Number of clusters: %lu\n",
                   Hotfuncs, Clusters.size())
         << format("  Final average call distance = %.1lf (%.0lf / %.0lf)\n",
                   TotalCalls ? TotalDistance / TotalCalls : 0,
                   TotalDistance, TotalCalls)
         << format("  Total Calls = %.0lf\n", TotalCalls);
  if (TotalCalls) {
    dbgs() << format("  Total Calls within 64B = %.0lf (%.2lf%%)\n",
                     TotalCalls64B, 100 * TotalCalls64B / TotalCalls)
           << format("  Total Calls within 4KB = %.0lf (%.2lf%%)\n",
                     TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls)
           << format("  Total Calls within 2MB = %.0lf (%.2lf%%)\n",
                     TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls);
  }
}

namespace {

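// Read the file given via -function-order: one function name per line.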
std::vector<std::string> readFunctionOrderFile() {
  std::vector<std::string> FunctionNames;
  std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in);
  if (!FuncsFile) {
    errs() << "Ordered functions file \"" << opts::FunctionOrderFile
           << "\" can't be opened.\n";
    exit(1);
  }
  std::string FuncName;
  while (std::getline(FuncsFile, FuncName)) {
    FunctionNames.push_back(FuncName);
  }
  return FunctionNames;
}

} // anonymous namespace

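// Pass entry point: build the call graph when a clustering algorithm is
// selected, run the chosen algorithm, apply the resulting order, and
// optionally dump it via -generate-function-order.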
void ReorderFunctions::runOnFunctions(BinaryContext &BC,
                                      std::map<uint64_t, BinaryFunction> &BFs,
                                      std::set<uint64_t> &LargeFunctions) {
  if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) {
    errs() << "BOLT-ERROR: Function reordering only works when "
           << "relocs are enabled.\n";
    exit(1);
  }

  if (opts::ReorderFunctions != BinaryFunction::RT_NONE &&
      opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT &&
      opts::ReorderFunctions != BinaryFunction::RT_USER) {
    Cg = buildCallGraph(BC,
                        BFs,
                        [this](const BinaryFunction &BF) {
                          return !shouldOptimize(BF) || !BF.hasProfile();
                        },
                        false, // IncludeColdCalls
                        opts::ReorderFunctionsUseHotSize,
                        opts::UseEdgeCounts);
    normalizeArcWeights();
  }

  std::vector<Cluster> Clusters;

  switch (opts::ReorderFunctions) {
  case BinaryFunction::RT_NONE:
    break;
  case BinaryFunction::RT_EXEC_COUNT:
    {
      std::vector<BinaryFunction *> SortedFunctions(BFs.size());
      uint32_t Index = 0;
      std::transform(BFs.begin(),
                     BFs.end(),
                     SortedFunctions.begin(),
                     [](std::pair<const uint64_t, BinaryFunction> &BFI) {
                       return &BFI.second;
                     });
      std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(),
                       [&](const BinaryFunction *A, const BinaryFunction *B) {
                         if (!opts::shouldProcess(*A))
                           return false;
                         const auto PadA = opts::padFunction(*A);
                         const auto PadB = opts::padFunction(*B);
                         if (!PadA || !PadB) {
                           if (PadA)
                             return true;
                           if (PadB)
                             return false;
                         }
                         return !A->hasProfile() &&
                           (B->hasProfile() ||
                            (A->getExecutionCount() > B->getExecutionCount()));
                       });
      for (auto *BF : SortedFunctions) {
        if (BF->hasProfile())
          BF->setIndex(Index++);
      }
    }
    break;
  case BinaryFunction::RT_HFSORT:
    Clusters = clusterize(Cg);
    break;
  case BinaryFunction::RT_HFSORT_PLUS:
    Clusters = hfsortPlus(Cg);
    break;
  case BinaryFunction::RT_PETTIS_HANSEN:
    Clusters = pettisAndHansen(Cg);
    break;
  case BinaryFunction::RT_RANDOM:
    std::srand(opts::RandomSeed);
    Clusters = randomClusters(Cg);
    break;
  case BinaryFunction::RT_USER:
    {
      uint32_t Index = 0;
      for (const auto &Function : readFunctionOrderFile()) {
        std::vector<uint64_t> FuncAddrs;

        auto Itr = BC.GlobalSymbols.find(Function);
        if (Itr == BC.GlobalSymbols.end()) {
          uint32_t LocalID = 1;
          while (1) {
            // If we can't find the main symbol name, look for alternates.
            Itr = BC.GlobalSymbols.find(Function + "/" + std::to_string(LocalID));
            if (Itr != BC.GlobalSymbols.end())
              FuncAddrs.push_back(Itr->second);
            else
              break;
            LocalID++;
          }
        } else {
          FuncAddrs.push_back(Itr->second);
        }

        if (FuncAddrs.empty()) {
          errs() << "BOLT-WARNING: Reorder functions: can't find function for "
                 << Function << ".\n";
          continue;
        }

        for (const auto FuncAddr : FuncAddrs) {
          const auto *FuncSym = BC.getOrCreateGlobalSymbol(FuncAddr, "FUNCat");
          assert(FuncSym);

          auto *BF = BC.getFunctionForSymbol(FuncSym);
          if (!BF) {
            errs() << "BOLT-WARNING: Reorder functions: can't find function for "
                   << Function << ".\n";
            break;
          }
          if (!BF->hasValidIndex()) {
            BF->setIndex(Index++);
          } else if (opts::Verbosity > 0) {
            errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n";
          }
        }
      }
    }
    break;
  }

  reorder(std::move(Clusters), BFs);

  if (!opts::GenerateFunctionOrderFile.empty()) {
    std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out);
    if (!FuncsFile) {
      errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile
             << "\" can't be opened.\n";
      exit(1);
    }

    std::vector<BinaryFunction *> SortedFunctions(BFs.size());

    std::transform(BFs.begin(),
                   BFs.end(),
                   SortedFunctions.begin(),
                   [](std::pair<const uint64_t, BinaryFunction> &BFI) {
                     return &BFI.second;
                   });

    // Sort functions by index.
    std::stable_sort(
      SortedFunctions.begin(),
      SortedFunctions.end(),
      [](const BinaryFunction *A, const BinaryFunction *B) {
        if (A->hasValidIndex() && B->hasValidIndex()) {
          return A->getIndex() < B->getIndex();
        } else if (A->hasValidIndex() && !B->hasValidIndex()) {
          return true;
        } else if (!A->hasValidIndex() && B->hasValidIndex()) {
          return false;
        } else {
          return A->getAddress() < B->getAddress();
        }
      });

    for (const auto *Func : SortedFunctions) {
      if (!Func->hasValidIndex())
        break;
      FuncsFile << Func->getSymbol()->getName().data() << "\n";
    }
    FuncsFile.close();

    outs() << "BOLT-INFO: dumped function order to \""
           << opts::GenerateFunctionOrderFile << "\"\n";

    exit(0);
  }
}

} // namespace bolt
} // namespace llvm

@@ -0,0 +1,43 @@
//===--- ReorderFunctions.h - Function reordering pass --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FUNCTIONS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FUNCTIONS_H

#include "BinaryPasses.h"
#include "HFSort.h"

namespace llvm {
namespace bolt {

/// Modify function order for streaming based on hotness.
class ReorderFunctions : public BinaryFunctionPass {
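  /// Call graph over binary functions, built from profile data.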
  CallGraph Cg;

  void normalizeArcWeights();
  void reorder(std::vector<Cluster> &&Clusters,
               std::map<uint64_t, BinaryFunction> &BFs);
public:
  explicit ReorderFunctions(const cl::opt<bool> &PrintPass)
    : BinaryFunctionPass(PrintPass) { }

  const char *getName() const override {
    return "reorder-functions";
  }
  void runOnFunctions(BinaryContext &BC,
                      std::map<uint64_t, BinaryFunction> &BFs,
                      std::set<uint64_t> &LargeFunctions) override;
};

} // namespace bolt
} // namespace llvm

#endif