Parallelize ICF Pass

Summary:
ICF consumes 10-15% of BOLT runtime; for HHVM that is around 45 seconds.
This diff parallelizes parts of the pass to make it faster.
A 60% reduction in ICF runtime is measured with the parallel version for HHVM.

(cherry picked from FBD15589515)
This commit is contained in:
laith sakka 2019-05-31 16:45:31 -07:00 committed by Maksim Panchenko
parent 9894de0094
commit 1ec091e6f5
5 changed files with 221 additions and 63 deletions

View File

@ -810,18 +810,30 @@ void BinaryContext::postProcessSymbolTable() {
}
// Folds ChildBF into ParentBF: ParentBF receives ChildBF's names, every symbol
// looked up for those names is re-pointed at ParentBF in SymbolToFunctionMap,
// and ChildBF's profile data is merged into ParentBF. When HasRelocations is
// set, ChildBF is erased from BinaryFunctions; otherwise it is kept, renamed
// with an "__ICF_" prefix, given a fresh output symbol and marked as folded.
//
// NOTE(review): this text comes from a rendered diff, so a few statements
// appear twice (the removed line followed by its replacement) — confirm the
// exact statement sequence against the real file.
void BinaryContext::foldFunction(BinaryFunction &ChildBF,
BinaryFunction &ParentBF) {
BinaryFunction &ParentBF) {
// Every lock below is constructed unlocked (std::defer_lock) and is acquired
// only around the individual access it guards.
std::shared_lock<std::shared_timed_mutex> ReadCtxLock(CtxMutex,
std::defer_lock);
std::unique_lock<std::shared_timed_mutex> WriteCtxLock(CtxMutex,
std::defer_lock);
std::unique_lock<std::shared_timed_mutex> WriteSymbolMapLock(
SymbolToFunctionMapMutex, std::defer_lock);
// Copy name list.
// NOTE(review): no lock is held for this call; presumably two threads never
// fold into the same ParentBF at once — confirm against the callers.
ParentBF.addNewNames(ChildBF.getNames());
// Update internal bookkeeping info.
for (auto &Name : ChildBF.getNames()) {
// Shared (reader) lock on CtxMutex for the duration of the symbol lookup.
ReadCtxLock.lock();
// Calls to functions are handled via symbols, and we keep the lookup table
// that we need to update.
auto *Symbol = Ctx->lookupSymbol(Name);
// NOTE(review): the next two lines are repeated further down with locking
// around the map write; they look like the pre-change side of the diff.
assert(Symbol && "symbol cannot be NULL at this point");
SymbolToFunctionMap[Symbol] = &ParentBF;
ReadCtxLock.unlock();
assert(Symbol && "symbol cannot be NULL at this point");
// Exclusive lock on SymbolToFunctionMapMutex for the map write.
WriteSymbolMapLock.lock();
SymbolToFunctionMap[Symbol] = &ParentBF;
WriteSymbolMapLock.unlock();
// NB: there's no need to update BinaryDataMap and GlobalSymbols.
}
@ -829,17 +841,32 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF,
ChildBF.mergeProfileDataInto(ParentBF);
if (HasRelocations) {
// Reader and writer locks over the same BinaryFunctionsMutex.
std::shared_lock<std::shared_timed_mutex> ReadBfsLock(BinaryFunctionsMutex,
std::defer_lock);
std::unique_lock<std::shared_timed_mutex> WriteBfsLock(BinaryFunctionsMutex,
std::defer_lock);
// Remove ChildBF from the global set of functions in relocs mode.
ReadBfsLock.lock();
auto FI = BinaryFunctions.find(ChildBF.getAddress());
ReadBfsLock.unlock();
// NOTE(review): FI is dereferenced and later erased after the reader lock has
// been released; this presumably relies on no other thread erasing the same
// entry in between — confirm.
assert(FI != BinaryFunctions.end() && "function not found");
assert(&ChildBF == &FI->second && "function mismatch");
WriteBfsLock.lock();
FI = BinaryFunctions.erase(FI);
WriteBfsLock.unlock();
} else {
// In non-relocation mode we keep the function, but rename it.
// NOTE(review): the two NewName definitions below look like the removed and
// added sides of the diff; only one can exist in the real file.
std::string NewName = "__ICF_" + ChildBF.Names.back();
std::string NewName = "__ICF_" + ChildBF.getSymbol()->getName().str();
ChildBF.Names.clear();
ChildBF.Names.push_back(NewName);
// Exclusive lock on CtxMutex while the new output symbol is obtained from Ctx.
WriteCtxLock.lock();
ChildBF.OutputSymbol = Ctx->getOrCreateSymbol(NewName);
WriteCtxLock.unlock();
ChildBF.setFolded();
}
}

View File

@ -43,6 +43,7 @@
#include <functional>
#include <map>
#include <set>
#include <shared_mutex>
#include <string>
#include <system_error>
#include <unordered_map>
@ -149,6 +150,9 @@ class BinaryContext {
/// Store all functions in the binary, sorted by original address.
std::map<uint64_t, BinaryFunction> BinaryFunctions;
/// A mutex that is used to control parallel accesses to BinaryFunctions
mutable std::shared_timed_mutex BinaryFunctionsMutex;
/// Functions injected by BOLT
std::vector<BinaryFunction *> InjectedBinaryFunctions;
@ -229,6 +233,9 @@ public:
std::unordered_map<const MCSymbol *,
BinaryFunction *> SymbolToFunctionMap;
/// A mutex that is used to control parallel accesses to SymbolToFunctionMap
mutable std::shared_timed_mutex SymbolToFunctionMapMutex;
/// Look up the symbol entry that contains the given \p Address (based on
/// the start address and size for each symbol). Returns a pointer to
/// the BinaryData for that symbol. If no data is found, nullptr is returned.
@ -332,6 +339,9 @@ public:
std::unique_ptr<MCContext> Ctx;
/// A mutex that is used to control parallel accesses to Ctx
mutable std::shared_timed_mutex CtxMutex;
std::unique_ptr<DWARFContext> DwCtx;
std::unique_ptr<Triple> TheTriple;
@ -815,7 +825,7 @@ public:
/// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then
/// removed from the list of functions \p BFs. The profile data of \p ChildBF
/// is merged into that of \p ParentBF.
/// is merged into that of \p ParentBF. This function is thread safe.
void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF);
/// Add a Section relocation at a given \p Address.
@ -829,12 +839,15 @@ public:
/// is no relocation at such address.
const Relocation *getRelocationAt(uint64_t Address);
/// Looks up the function registered for \p Symbol in SymbolToFunctionMap while
/// holding a shared lock on SymbolToFunctionMapMutex. Returns nullptr when the
/// symbol has no entry. This function is thread safe.
const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const {
  std::shared_lock<std::shared_timed_mutex> ReadGuard(SymbolToFunctionMapMutex);
  const auto Entry = SymbolToFunctionMap.find(Symbol);
  if (Entry == SymbolToFunctionMap.end())
    return nullptr;
  return Entry->second;
}
/// Non-const overload: returns the function mapped to \p Symbol, or nullptr if
/// SymbolToFunctionMap has no entry for it. The lookup is performed under a
/// shared lock on SymbolToFunctionMapMutex.
BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) {
  std::shared_lock<std::shared_timed_mutex> ReadGuard(SymbolToFunctionMapMutex);
  const auto Found = SymbolToFunctionMap.find(Symbol);
  const bool IsKnown = Found != SymbolToFunctionMap.end();
  return IsKnown ? Found->second : nullptr;
}

View File

@ -41,6 +41,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <algorithm>
using namespace llvm::object;
@ -1307,7 +1308,8 @@ public:
/// Add new names this function is known under.
///
/// The elements of \p NewNames are inserted at the front of Names, and the
/// whole Names list is then sorted with std::sort.
///
/// NOTE(review): the two identical insert statements below appear to be the
/// removed/added pair of a whitespace-only change in the rendered diff —
/// confirm that the real file contains the insert only once.
template <class ContainterTy>
void addNewNames(const ContainterTy &NewNames) {
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
Names.insert(Names.begin(), NewNames.begin(), NewNames.end());
std::sort(Names.begin(), Names.end());
}
/// Create a basic block at a given \p Offset in the

View File

@ -9,9 +9,11 @@
//
//===----------------------------------------------------------------------===//
#include "Passes/IdenticalCodeFolding.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Timer.h"
#include <atomic>
#include <map>
#include <set>
#include <unordered_map>
@ -24,6 +26,8 @@ using namespace bolt;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
// Threading options defined in the tool driver.
extern cl::opt<int> ThreadCount;
// The "no-threads" option is defined as cl::opt<bool>; the extern declaration
// must use the same type, otherwise declaration and definition disagree.
extern cl::opt<bool> NoThreads;
static cl::opt<bool>
UseDFS("icf-dfs",
@ -31,7 +35,13 @@ UseDFS("icf-dfs",
cl::ReallyHidden,
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
// Hidden boolean option "time-icf". Its value is passed as the last argument
// of the NamedRegionTimers in the "ICF breakdown" group — presumably their
// enable flag (confirm against the NamedRegionTimer constructor).
static cl::opt<bool>
TimeICF("time-icf",
cl::desc("time icf steps"),
cl::ReallyHidden,
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
} // namespace opts
namespace {
@ -276,70 +286,133 @@ bool isIdenticalWith(const BinaryFunction &A, const BinaryFunction &B,
return true;
}
}
// Hash functor for the hash tables used to identify identical functions; each
// such table maps a function to a bucket of functions identical to it.
struct KeyHash {
  std::size_t operator()(const BinaryFunction *F) const {
    // Ask for the function's hash without forcing a recomputation.
    const std::size_t Hash = F->hash(/*Recompute=*/false);
    return Hash;
  }
};
// Equality functor for congruent buckets: a function is congruent with itself,
// and two distinct functions are congruent when isIdenticalWith() accepts them
// with symbols ignored.
struct KeyCongruent {
  bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
    return A == B ||
           isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
  }
};
// Equality functor for identical buckets: same comparison as KeyCongruent but
// with symbols taken into account (IgnoreSymbols=false).
struct KeyEqual {
  bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
    if (A != B)
      return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
    // The very same object is trivially identical to itself.
    return true;
  }
};
// Maps a function to the set of functions congruent with it (KeyCongruent).
using CongruentBucketsMap =
    std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>, KeyHash,
                       KeyCongruent>;
// Maps a function to the list of functions identical to it (KeyEqual).
using IdenticalBucketsMap =
    std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
                       KeyHash, KeyEqual>;
} // namespace
namespace llvm {
namespace bolt {
void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
const auto OriginalFunctionCount = BC.getBinaryFunctions().size();
uint64_t NumFunctionsFolded = 0;
uint64_t NumJTFunctionsFolded = 0;
uint64_t BytesSavedEstimate = 0;
uint64_t CallsSavedEstimate = 0;
uint64_t NumFunctionsFolded{0};
std::atomic<uint64_t> NumJTFunctionsFolded{0};
std::atomic<uint64_t> BytesSavedEstimate{0};
std::atomic<uint64_t> CallsSavedEstimate{0};
std::atomic<uint64_t> NumFoldedLastIteration{0};
CongruentBucketsMap CongruentBuckets;
std::unique_ptr<ThreadPool> ThPool;
if (!opts::NoThreads)
ThPool = std::make_unique<ThreadPool>(opts::ThreadCount);
// This hash table is used to identify identical functions. It maps
// a function to a bucket of functions identical to it.
struct KeyHash {
std::size_t operator()(const BinaryFunction *F) const {
return F->hash(/*Recompute=*/false);
// Hash all the functions
auto hashFunctions = [&]() {
NamedRegionTimer HashFunctionsTimer("hashing", "hashing", "ICF breakdown",
"ICF breakdown", opts::TimeICF);
// Perform hashing for a block of functions
auto hashBlock =
[&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
Timer T("hash block", "hash block");
DEBUG(T.startTimer());
for (auto It = BlockBegin; It != BlockEnd; ++It) {
auto &BF = It->second;
if (!shouldOptimize(BF) || BF.isFolded() || BF.hasSDTMarker())
continue;
// Make sure indices are in-order.
BF.updateLayoutIndices();
// Pre-compute hash before pushing into hashtable.
BF.hash(/*Recompute=*/true, opts::UseDFS);
}
DEBUG(T.stopTimer());
};
if (opts::NoThreads) {
hashBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end());
return;
}
};
struct KeyCongruent {
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS);
}
};
struct KeyEqual {
bool operator()(const BinaryFunction *A, const BinaryFunction *B) const {
return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS);
const unsigned BlockSize = OriginalFunctionCount / (2 * opts::ThreadCount);
unsigned Counter = 0;
auto BlockBegin = BC.getBinaryFunctions().begin();
for (auto It = BC.getBinaryFunctions().begin();
It != BC.getBinaryFunctions().end(); ++It, ++Counter) {
if (Counter >= BlockSize) {
ThPool->async(hashBlock, BlockBegin, std::next(It));
BlockBegin = std::next(It);
Counter = 0;
}
}
ThPool->async(hashBlock, BlockBegin, BC.getBinaryFunctions().end());
ThPool->wait();
};
// Create buckets with congruent functions - functions that potentially could
// be folded.
std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>,
KeyHash, KeyCongruent> CongruentBuckets;
for (auto &BFI : BC.getBinaryFunctions()) {
auto &BF = BFI.second;
if (!shouldOptimize(BF) || BF.isFolded() || BF.hasSDTMarker())
continue;
// Make sure indices are in-order.
BF.updateLayoutIndices();
// Pre-compute hash before pushing into hashtable.
BF.hash(/*Recompute=*/true, opts::UseDFS);
CongruentBuckets[&BF].emplace(&BF);
}
// We repeat the pass until no new modifications happen.
unsigned Iteration = 1;
uint64_t NumFoldedLastIteration;
do {
NumFoldedLastIteration = 0;
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
for (auto &CBI : CongruentBuckets) {
auto &Candidates = CBI.second;
if (Candidates.size() < 2)
// Places every eligible function into its bucket of congruent functions, i.e.
// functions that potentially could be folded.
auto createCongruentBuckets = [&]() {
  NamedRegionTimer CongruentBucketsTimer("congruent buckets",
                                         "congruent buckets", "ICF breakdown",
                                         "ICF breakdown", opts::TimeICF);
  for (auto &Entry : BC.getBinaryFunctions()) {
    auto &Func = Entry.second;
    // Same filter as the hashing step: skip non-optimizable, already folded
    // and SDT-marked functions.
    if (shouldOptimize(Func) && !Func.isFolded() && !Func.hasSDTMarker())
      CongruentBuckets[&Func].emplace(&Func);
  }
};
// Partition each set of congruent functions into sets of identical functions
// and fold them
auto performFoldingPass = [&]() {
NamedRegionTimer FoldingPassesTimer("folding passes", "folding passes",
"ICF breakdown", "ICF breakdown",
opts::TimeICF);
Timer SinglePass("single fold pass", "single fold pass");
DEBUG(SinglePass.startTimer());
// Perform the work for a single congruent list
auto performFoldingForItem = [&](std::set<BinaryFunction *> &Candidates) {
Timer T("folding single congruent list", "folding single congruent list");
DEBUG(T.startTimer());
// Identical functions go into the same bucket.
std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
KeyHash, KeyEqual> IdenticalBuckets;
IdenticalBucketsMap IdenticalBuckets;
for (auto *BF : Candidates) {
IdenticalBuckets[BF].emplace_back(BF);
}
@ -353,9 +426,9 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
// Fold functions. Keep the order consistent across invocations with
// different options.
std::stable_sort(Twins.begin(), Twins.end(),
[](const BinaryFunction *A, const BinaryFunction *B) {
return A->getFunctionNumber() < B->getFunctionNumber();
});
[](const BinaryFunction *A, const BinaryFunction *B) {
return A->getFunctionNumber() < B->getFunctionNumber();
});
BinaryFunction *ParentBF = Twins[0];
for (unsigned i = 1; i < Twins.size(); ++i) {
@ -382,13 +455,44 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
}
}
DEBUG(T.stopTimer());
};
// Create a task for each congruent list
for (auto &Entry : CongruentBuckets) {
auto &Candidates = Entry.second;
if (Candidates.size() < 2)
continue;
if (opts::NoThreads)
performFoldingForItem(Candidates);
else
ThPool->async(performFoldingForItem, std::ref(Candidates));
}
if (opts::NoThreads)
return;
ThPool->wait();
DEBUG(SinglePass.stopTimer());
};
hashFunctions();
createCongruentBuckets();
unsigned Iteration = 1;
// We repeat the pass until no new modifications happen.
do {
NumFoldedLastIteration = 0;
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
performFoldingPass();
NumFunctionsFolded += NumFoldedLastIteration;
++Iteration;
} while (NumFoldedLastIteration > 0);
DEBUG(
DEBUG(
// Print functions that are congruent but not identical.
for (auto &CBI : CongruentBuckets) {
auto &Candidates = CBI.second;

View File

@ -23,6 +23,7 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"
@ -104,8 +105,19 @@ PerfDataA("p",
cl::aliasopt(PerfData),
cl::cat(AggregatorCategory));
} // namespace opts
// Option "thread-count": number of threads, initialised from
// hardware_concurrency().
// NOTE(review): the ICF pass sizes its ThreadPool with this value and divides
// by 2 * ThreadCount when splitting functions into hash blocks, so a value of
// 0 looks unsafe — confirm whether the option needs validation.
cl::opt<int>
ThreadCount("thread-count",
cl::desc("number of threads"),
cl::init(hardware_concurrency()),
cl::cat(BoltCategory));
// Option "no-threads": boolean switch, off by default. The ICF pass checks it
// to decide whether to create a ThreadPool or run its steps inline.
cl::opt<bool>
NoThreads("no-threads",
  cl::desc("disable multithreading"),
  cl::init(false),
  cl::cat(BoltCategory));
} // namespace opts
static StringRef ToolName;
static void report_error(StringRef Message, std::error_code EC) {