forked from OSchip/llvm-project
Parallelize ICF Pass
Summary: ICF consumes 10-15% of BOLT's runtime; for HHVM that is around 45 seconds. This diff parallelizes parts of the pass to make it faster. A 60% reduction in ICF runtime was measured with the parallel version on HHVM. (cherry picked from FBD15589515)
This commit is contained in:
parent
9894de0094
commit
1ec091e6f5
|
@ -810,18 +810,30 @@ void BinaryContext::postProcessSymbolTable() {
|
|||
}
|
||||
|
||||
void BinaryContext::foldFunction(BinaryFunction &ChildBF,
|
||||
BinaryFunction &ParentBF) {
|
||||
BinaryFunction &ParentBF) {
|
||||
std::shared_lock<std::shared_timed_mutex> ReadCtxLock(CtxMutex,
|
||||
std::defer_lock);
|
||||
std::unique_lock<std::shared_timed_mutex> WriteCtxLock(CtxMutex,
|
||||
std::defer_lock);
|
||||
std::unique_lock<std::shared_timed_mutex> WriteSymbolMapLock(
|
||||
SymbolToFunctionMapMutex, std::defer_lock);
|
||||
|
||||
// Copy name list.
|
||||
ParentBF.addNewNames(ChildBF.getNames());
|
||||
|
||||
// Update internal bookkeeping info.
|
||||
for (auto &Name : ChildBF.getNames()) {
|
||||
ReadCtxLock.lock();
|
||||
// Calls to functions are handled via symbols, and we keep the lookup table
|
||||
// that we need to update.
|
||||
auto *Symbol = Ctx->lookupSymbol(Name);
|
||||
assert(Symbol && "symbol cannot be NULL at this point");
|
||||
SymbolToFunctionMap[Symbol] = &ParentBF;
|
||||
ReadCtxLock.unlock();
|
||||
|
||||
assert(Symbol && "symbol cannot be NULL at this point");
|
||||
|
||||
WriteSymbolMapLock.lock();
|
||||
SymbolToFunctionMap[Symbol] = &ParentBF;
|
||||
WriteSymbolMapLock.unlock();
|
||||
// NB: there's no need to update BinaryDataMap and GlobalSymbols.
|
||||
}
|
||||
|
||||
|
@ -829,17 +841,32 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF,
|
|||
ChildBF.mergeProfileDataInto(ParentBF);
|
||||
|
||||
if (HasRelocations) {
|
||||
std::shared_lock<std::shared_timed_mutex> ReadBfsLock(BinaryFunctionsMutex,
|
||||
std::defer_lock);
|
||||
std::unique_lock<std::shared_timed_mutex> WriteBfsLock(BinaryFunctionsMutex,
|
||||
std::defer_lock);
|
||||
// Remove ChildBF from the global set of functions in relocs mode.
|
||||
ReadBfsLock.lock();
|
||||
auto FI = BinaryFunctions.find(ChildBF.getAddress());
|
||||
ReadBfsLock.unlock();
|
||||
|
||||
assert(FI != BinaryFunctions.end() && "function not found");
|
||||
assert(&ChildBF == &FI->second && "function mismatch");
|
||||
|
||||
WriteBfsLock.lock();
|
||||
FI = BinaryFunctions.erase(FI);
|
||||
WriteBfsLock.unlock();
|
||||
|
||||
} else {
|
||||
// In non-relocation mode we keep the function, but rename it.
|
||||
std::string NewName = "__ICF_" + ChildBF.Names.back();
|
||||
std::string NewName = "__ICF_" + ChildBF.getSymbol()->getName().str();
|
||||
ChildBF.Names.clear();
|
||||
ChildBF.Names.push_back(NewName);
|
||||
|
||||
WriteCtxLock.lock();
|
||||
ChildBF.OutputSymbol = Ctx->getOrCreateSymbol(NewName);
|
||||
WriteCtxLock.unlock();
|
||||
|
||||
ChildBF.setFolded();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
#include <functional>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <shared_mutex>
|
||||
#include <string>
|
||||
#include <system_error>
|
||||
#include <unordered_map>
|
||||
|
@ -149,6 +150,9 @@ class BinaryContext {
|
|||
/// Store all functions in the binary, sorted by original address.
|
||||
std::map<uint64_t, BinaryFunction> BinaryFunctions;
|
||||
|
||||
/// A mutex that is used to control parallel accesses to BinaryFunctions
|
||||
mutable std::shared_timed_mutex BinaryFunctionsMutex;
|
||||
|
||||
/// Functions injected by BOLT
|
||||
std::vector<BinaryFunction *> InjectedBinaryFunctions;
|
||||
|
||||
|
@ -229,6 +233,9 @@ public:
|
|||
std::unordered_map<const MCSymbol *,
|
||||
BinaryFunction *> SymbolToFunctionMap;
|
||||
|
||||
/// A mutex that is used to control parallel accesses to SymbolToFunctionMap
|
||||
mutable std::shared_timed_mutex SymbolToFunctionMapMutex;
|
||||
|
||||
/// Look up the symbol entry that contains the given \p Address (based on
|
||||
/// the start address and size for each symbol). Returns a pointer to
|
||||
/// the BinaryData for that symbol. If no data is found, nullptr is returned.
|
||||
|
@ -332,6 +339,9 @@ public:
|
|||
|
||||
std::unique_ptr<MCContext> Ctx;
|
||||
|
||||
/// A mutex that is used to control parallel accesses to Ctx
|
||||
mutable std::shared_timed_mutex CtxMutex;
|
||||
|
||||
std::unique_ptr<DWARFContext> DwCtx;
|
||||
|
||||
std::unique_ptr<Triple> TheTriple;
|
||||
|
@ -815,7 +825,7 @@ public:
|
|||
|
||||
/// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then
|
||||
/// removed from the list of functions \p BFs. The profile data of \p ChildBF
|
||||
/// is merged into that of \p ParentBF.
|
||||
/// is merged into that of \p ParentBF. This function is thread safe.
|
||||
void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF);
|
||||
|
||||
/// Add a Section relocation at a given \p Address.
|
||||
|
@ -829,12 +839,15 @@ public:
|
|||
/// is no relocation at such address.
|
||||
const Relocation *getRelocationAt(uint64_t Address);
|
||||
|
||||
/// This function is thread safe.
|
||||
const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const {
|
||||
std::shared_lock<std::shared_timed_mutex> Lock(SymbolToFunctionMapMutex);
|
||||
auto BFI = SymbolToFunctionMap.find(Symbol);
|
||||
return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second;
|
||||
}
|
||||
|
||||
/// Non-const overload: return the function recorded for \p Symbol in
/// SymbolToFunctionMap, or nullptr when the symbol has no entry. The lookup
/// runs under a shared lock on SymbolToFunctionMapMutex.
BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) {
  std::shared_lock<std::shared_timed_mutex> ReadGuard(
      SymbolToFunctionMapMutex);
  const auto Entry = SymbolToFunctionMap.find(Symbol);
  if (Entry == SymbolToFunctionMap.end())
    return nullptr;
  return Entry->second;
}
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace llvm::object;
|
||||
|
||||
|
@ -1307,7 +1308,8 @@ public:
|
|||
/// Add new names this function is known under.
|
||||
template <class ContainterTy>
|
||||
void addNewNames(const ContainterTy &NewNames) {
|
||||
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
|
||||
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
|
||||
std::sort(Names.begin(), Names.end());
|
||||
}
|
||||
|
||||
/// Create a basic block at a given \p Offset in the
|
||||
|
|
|
@ -9,9 +9,11 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#include "Passes/IdenticalCodeFolding.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
#include <atomic>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
@ -24,6 +26,8 @@ using namespace bolt;
|
|||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltOptCategory;
// Threading options defined with the tool's top-level options. Each
// declaration has to name exactly the type used by its definition:
// NoThreads is defined as cl::opt<bool>, so declaring it as cl::opt<int>
// would make every read of it undefined behaviour.
extern cl::opt<int> ThreadCount;
extern cl::opt<bool> NoThreads;
|
||||
|
||||
static cl::opt<bool>
|
||||
UseDFS("icf-dfs",
|
||||
|
@ -31,7 +35,13 @@ UseDFS("icf-dfs",
|
|||
cl::ReallyHidden,
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
|
||||
static cl::opt<bool>
|
||||
TimeICF("time-icf",
|
||||
cl::desc("time icf steps"),
|
||||
cl::ReallyHidden,
|
||||
cl::ZeroOrMore,
|
||||
cl::cat(BoltOptCategory));
|
||||
} // namespace opts
|
||||
|
||||
namespace {
|
||||
|
@ -276,70 +286,133 @@ bool isIdenticalWith(const BinaryFunction &A, const BinaryFunction &B,
|
|||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// This hash table is used to identify identical functions. It maps
|
||||
// a function to a bucket of functions identical to it.
|
||||
struct KeyHash {
|
||||
std::size_t operator()(const BinaryFunction *F) const {
|
||||
return F->hash(/*Recompute=*/false);
|
||||
}
|
||||
};
|
||||
|
||||
struct KeyCongruent {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
if (A == B)
|
||||
return true;
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
|
||||
}
|
||||
};
|
||||
|
||||
struct KeyEqual {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
if (A == B)
|
||||
return true;
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>,
|
||||
KeyHash, KeyCongruent>
|
||||
CongruentBucketsMap;
|
||||
|
||||
typedef std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
|
||||
KeyHash, KeyEqual>
|
||||
IdenticalBucketsMap;
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
|
||||
const auto OriginalFunctionCount = BC.getBinaryFunctions().size();
|
||||
uint64_t NumFunctionsFolded = 0;
|
||||
uint64_t NumJTFunctionsFolded = 0;
|
||||
uint64_t BytesSavedEstimate = 0;
|
||||
uint64_t CallsSavedEstimate = 0;
|
||||
uint64_t NumFunctionsFolded{0};
|
||||
std::atomic<uint64_t> NumJTFunctionsFolded{0};
|
||||
std::atomic<uint64_t> BytesSavedEstimate{0};
|
||||
std::atomic<uint64_t> CallsSavedEstimate{0};
|
||||
std::atomic<uint64_t> NumFoldedLastIteration{0};
|
||||
CongruentBucketsMap CongruentBuckets;
|
||||
std::unique_ptr<ThreadPool> ThPool;
|
||||
if (!opts::NoThreads)
|
||||
ThPool = std::make_unique<ThreadPool>(opts::ThreadCount);
|
||||
|
||||
// This hash table is used to identify identical functions. It maps
|
||||
// a function to a bucket of functions identical to it.
|
||||
struct KeyHash {
|
||||
std::size_t operator()(const BinaryFunction *F) const {
|
||||
return F->hash(/*Recompute=*/false);
|
||||
// Hash all the functions
|
||||
auto hashFunctions = [&]() {
|
||||
NamedRegionTimer HashFunctionsTimer("hashing", "hashing", "ICF breakdown",
|
||||
"ICF breakdown", opts::TimeICF);
|
||||
|
||||
// Perform hashing for a block of functions
|
||||
auto hashBlock =
|
||||
[&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
|
||||
std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
|
||||
Timer T("hash block", "hash block");
|
||||
DEBUG(T.startTimer());
|
||||
|
||||
for (auto It = BlockBegin; It != BlockEnd; ++It) {
|
||||
auto &BF = It->second;
|
||||
if (!shouldOptimize(BF) || BF.isFolded() || BF.hasSDTMarker())
|
||||
continue;
|
||||
// Make sure indices are in-order.
|
||||
BF.updateLayoutIndices();
|
||||
|
||||
// Pre-compute hash before pushing into hashtable.
|
||||
BF.hash(/*Recompute=*/true, opts::UseDFS);
|
||||
}
|
||||
DEBUG(T.stopTimer());
|
||||
};
|
||||
|
||||
if (opts::NoThreads) {
|
||||
hashBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end());
|
||||
return;
|
||||
}
|
||||
};
|
||||
struct KeyCongruent {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
|
||||
}
|
||||
};
|
||||
struct KeyEqual {
|
||||
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
|
||||
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
|
||||
|
||||
const unsigned BlockSize = OriginalFunctionCount / (2 * opts::ThreadCount);
|
||||
unsigned Counter = 0;
|
||||
auto BlockBegin = BC.getBinaryFunctions().begin();
|
||||
|
||||
for (auto It = BC.getBinaryFunctions().begin();
|
||||
It != BC.getBinaryFunctions().end(); ++It, ++Counter) {
|
||||
if (Counter >= BlockSize) {
|
||||
ThPool->async(hashBlock, BlockBegin, std::next(It));
|
||||
BlockBegin = std::next(It);
|
||||
Counter = 0;
|
||||
}
|
||||
}
|
||||
ThPool->async(hashBlock, BlockBegin, BC.getBinaryFunctions().end());
|
||||
|
||||
ThPool->wait();
|
||||
};
|
||||
|
||||
// Create buckets with congruent functions - functions that potentially could
|
||||
// be folded.
|
||||
std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>,
|
||||
KeyHash, KeyCongruent> CongruentBuckets;
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &BF = BFI.second;
|
||||
if (!shouldOptimize(BF) || BF.isFolded() || BF.hasSDTMarker())
|
||||
continue;
|
||||
|
||||
// Make sure indices are in-order.
|
||||
BF.updateLayoutIndices();
|
||||
|
||||
// Pre-compute hash before pushing into hashtable.
|
||||
BF.hash(/*Recompute=*/true, opts::UseDFS);
|
||||
|
||||
CongruentBuckets[&BF].emplace(&BF);
|
||||
}
|
||||
|
||||
// We repeat the pass until no new modifications happen.
|
||||
unsigned Iteration = 1;
|
||||
uint64_t NumFoldedLastIteration;
|
||||
do {
|
||||
NumFoldedLastIteration = 0;
|
||||
|
||||
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
|
||||
|
||||
for (auto &CBI : CongruentBuckets) {
|
||||
auto &Candidates = CBI.second;
|
||||
if (Candidates.size() < 2)
|
||||
// Creates buckets with congruent functions - functions that potentially
|
||||
// could be folded.
|
||||
auto createCongruentBuckets = [&]() {
|
||||
NamedRegionTimer CongruentBucketsTimer("congruent buckets",
|
||||
"congruent buckets", "ICF breakdown",
|
||||
"ICF breakdown", opts::TimeICF);
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
auto &BF = BFI.second;
|
||||
if (!shouldOptimize(BF) || BF.isFolded() || BF.hasSDTMarker())
|
||||
continue;
|
||||
CongruentBuckets[&BF].emplace(&BF);
|
||||
}
|
||||
};
|
||||
|
||||
// Partition each set of congruent functions into sets of identical functions
|
||||
// and fold them
|
||||
auto performFoldingPass = [&]() {
|
||||
NamedRegionTimer FoldingPassesTimer("folding passes", "folding passes",
|
||||
"ICF breakdown", "ICF breakdown",
|
||||
opts::TimeICF);
|
||||
Timer SinglePass("single fold pass", "single fold pass");
|
||||
DEBUG(SinglePass.startTimer());
|
||||
|
||||
// Perform the work for a single congruent list
|
||||
auto performFoldingForItem = [&](std::set<BinaryFunction *> &Candidates) {
|
||||
Timer T("folding single congruent list", "folding single congruent list");
|
||||
DEBUG(T.startTimer());
|
||||
|
||||
// Identical functions go into the same bucket.
|
||||
std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
|
||||
KeyHash, KeyEqual> IdenticalBuckets;
|
||||
IdenticalBucketsMap IdenticalBuckets;
|
||||
for (auto *BF : Candidates) {
|
||||
IdenticalBuckets[BF].emplace_back(BF);
|
||||
}
|
||||
|
@ -353,9 +426,9 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
|
|||
// Fold functions. Keep the order consistent across invocations with
|
||||
// different options.
|
||||
std::stable_sort(Twins.begin(), Twins.end(),
|
||||
[](const BinaryFunction *A, const BinaryFunction *B) {
|
||||
return A->getFunctionNumber() < B->getFunctionNumber();
|
||||
});
|
||||
[](const BinaryFunction *A, const BinaryFunction *B) {
|
||||
return A->getFunctionNumber() < B->getFunctionNumber();
|
||||
});
|
||||
|
||||
BinaryFunction *ParentBF = Twins[0];
|
||||
for (unsigned i = 1; i < Twins.size(); ++i) {
|
||||
|
@ -382,13 +455,44 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
|
|||
}
|
||||
}
|
||||
|
||||
DEBUG(T.stopTimer());
|
||||
};
|
||||
|
||||
// Create a task for each congruent list
|
||||
for (auto &Entry : CongruentBuckets) {
|
||||
auto &Candidates = Entry.second;
|
||||
if (Candidates.size() < 2)
|
||||
continue;
|
||||
|
||||
if (opts::NoThreads)
|
||||
performFoldingForItem(Candidates);
|
||||
else
|
||||
ThPool->async(performFoldingForItem, std::ref(Candidates));
|
||||
}
|
||||
if (opts::NoThreads)
|
||||
return;
|
||||
|
||||
ThPool->wait();
|
||||
DEBUG(SinglePass.stopTimer());
|
||||
};
|
||||
|
||||
hashFunctions();
|
||||
createCongruentBuckets();
|
||||
|
||||
unsigned Iteration = 1;
|
||||
// We repeat the pass until no new modifications happen.
|
||||
do {
|
||||
NumFoldedLastIteration = 0;
|
||||
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
|
||||
|
||||
performFoldingPass();
|
||||
|
||||
NumFunctionsFolded += NumFoldedLastIteration;
|
||||
++Iteration;
|
||||
|
||||
} while (NumFoldedLastIteration > 0);
|
||||
|
||||
DEBUG(
|
||||
DEBUG(
|
||||
// Print functions that are congruent but not identical.
|
||||
for (auto &CBI : CongruentBuckets) {
|
||||
auto &Candidates = CBI.second;
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "llvm/Support/Signals.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
|
||||
#undef DEBUG_TYPE
|
||||
#define DEBUG_TYPE "bolt"
|
||||
|
@ -104,8 +105,19 @@ PerfDataA("p",
|
|||
cl::aliasopt(PerfData),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
} // namespace opts
|
||||
cl::opt<int>
|
||||
ThreadCount("thread-count",
|
||||
cl::desc("number of threads"),
|
||||
cl::init(hardware_concurrency()),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
// -no-threads: turn off multithreading. Off by default. Passes that consult
// this flag are expected to run their work inline instead of using a thread
// pool when it is set.
cl::opt<bool>
NoThreads("no-threads",
  cl::desc("disable multithreading"),
  cl::init(false),
  cl::cat(BoltCategory));
|
||||
|
||||
} // namespace opts
|
||||
static StringRef ToolName;
|
||||
|
||||
static void report_error(StringRef Message, std::error_code EC) {
|
||||
|
|
Loading…
Reference in New Issue