forked from OSchip/llvm-project
Create a general interface to implement parallel tasks easily and apply it to run EliminateUnreachableBlocks in parallel.
Summary: Each time we run some work in parallel over the list of functions in BOLT, we have to manage a thread pool, schedule tasks, and tune the granularity of the tasks based on the type of work being done. This change creates an interface that abstracts those details away: the user provides the callback that will run on each function, plus some policy parameters that set up the scheduling and granularity configuration. This will make it easier to implement parallel tasks and eliminate redundant coding effort. (cherry picked from FBD16116077)
This commit is contained in:
parent
f10d1fe0f3
commit
3cfc76cdbf
|
@ -83,6 +83,7 @@ add_llvm_tool(llvm-bolt
|
|||
Heatmap.cpp
|
||||
JumpTable.cpp
|
||||
MCPlusBuilder.cpp
|
||||
ParallelUtilities.cpp
|
||||
ProfileReader.cpp
|
||||
ProfileWriter.cpp
|
||||
Relocation.cpp
|
||||
|
|
|
@ -0,0 +1,139 @@
|
|||
//===--- ParallelUtilities.cpp -------------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
#include <mutex>
|
||||
#include <shared_mutex>
|
||||
|
||||
#define DEBUG_TYPE "par-utils"
|
||||
|
||||
|
||||
// Command-line options that control how ParallelUtilities runs work.
namespace opts {
|
||||
extern cl::OptionCategory BoltCategory;
|
||||

||||
// Number of threads. Passed to the ThreadPool constructor in getThreadPool()
// and multiplied by the tasks-per-thread value in runOnEachFunction() to get
// the number of work blocks. Defaults to hardware_concurrency().
cl::opt<unsigned>
|
||||
ThreadCount("thread-count",
|
||||
cl::desc("number of threads"),
|
||||
cl::init(hardware_concurrency()),
|
||||
cl::cat(BoltCategory));
|
||||

||||
// When set, runOnEachFunction() calls the work function directly for every
// function instead of submitting blocks to the thread pool.
cl::opt<bool>
|
||||
NoThreads("no-threads",
|
||||
cl::desc("disable multithreading"),
|
||||
cl::init(false),
|
||||
cl::cat(BoltCategory));
|
||||

||||
// Default for the TasksPerThread parameter of runOnEachFunction().
cl::opt<unsigned>
|
||||
TaskCount("tasks-per-thread",
|
||||
cl::desc("number of tasks to be created per thread"),
|
||||
cl::init(20),
|
||||
cl::cat(BoltCategory));
|
||||

||||
} // namespace opts
|
||||
|
||||
namespace {
|
||||
/// A single thread pool that is used to run parallel tasks. It starts out
/// null and is created by the first call to getThreadPool().
|
||||
std::unique_ptr<ThreadPool> ThPoolPtr;
|
||||
} // namespace
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
namespace ParallelUtilities {
|
||||
|
||||
/// Hand back the shared thread pool, constructing it with opts::ThreadCount
/// threads the first time it is requested.
ThreadPool &getThreadPool() {
  if (!ThPoolPtr)
    ThPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);

  return *ThPoolPtr;
}
|
||||
|
||||
/// Run \p WorkFunction on every function in \p BC that \p SkipPredicate does
/// not reject (an empty predicate rejects nothing).
///
/// If opts::NoThreads is set, or the configured thread count is zero, every
/// function is processed sequentially by a direct call. Otherwise the function
/// map is cut into contiguous blocks of roughly equal estimated cost (about
/// \p TasksPerThread blocks per thread), each block is submitted to the shared
/// thread pool, and the call returns after ThreadPool::wait().
///
/// \p SchedPolicy selects how the cost of one function is estimated.
/// \p LogName names the per-block Timer that is started/stopped inside DEBUG().
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
                       WorkFuncTy WorkFunction, PredicateTy SkipPredicate,
                       std::string LogName, unsigned TasksPerThread) {
  using FuncIterator = std::map<uint64_t, BinaryFunction>::iterator;
  auto &BFs = BC.getBinaryFunctions();

  // Nothing to run; also avoids queueing an empty task below.
  if (BFs.empty())
    return;

  // Process one contiguous range of functions. Captures by reference, which is
  // safe because this function does not return before ThPool.wait().
  auto runBlock = [&](FuncIterator BlockBegin, FuncIterator BlockEnd) {
    Timer T(LogName, LogName);
    DEBUG(T.startTimer());

    for (auto It = BlockBegin; It != BlockEnd; ++It) {
      auto &BF = It->second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;

      WorkFunction(BF);
    }
    DEBUG(T.stopTimer());
  };

  // A zero thread count leaves no workers to split the work across (and used
  // to cause a division by zero below), so treat it like NoThreads.
  const unsigned ThreadCount = opts::ThreadCount;
  if (opts::NoThreads || ThreadCount == 0) {
    runBlock(BFs.begin(), BFs.end());
    return;
  }

  // SP_TRIVIAL deliberately never evaluates the predicate while estimating
  // cost; every other policy only charges for non-skipped functions.
  const bool PredicateAffectsCost = SchedPolicy != SchedulingPolicy::SP_TRIVIAL;
  auto isSkippedForCost = [&](const BinaryFunction &BF) {
    return PredicateAffectsCost && SkipPredicate && SkipPredicate(BF);
  };

  // Estimated cost of a single function. Computed in 64 bits so that the
  // squared size used by SP_QUADRATIC does not wrap around.
  auto costOf = [&](const BinaryFunction &BF) -> uint64_t {
    switch (SchedPolicy) {
    case SchedulingPolicy::SP_TRIVIAL:
    case SchedulingPolicy::SP_CONSTANT:
      return 1;
    case SchedulingPolicy::SP_LINEAR:
      return BF.size();
    case SchedulingPolicy::SP_QUADRATIC:
      return static_cast<uint64_t>(BF.size()) * BF.size();
    }
    return 0;
  };

  // Estimate the overall runtime cost using the scheduling policy
  uint64_t TotalCost = 0;
  if (!PredicateAffectsCost) {
    TotalCost = BFs.size();
  } else {
    for (auto &BFI : BFs) {
      if (isSkippedForCost(BFI.second))
        continue;
      TotalCost += costOf(BFI.second);
    }
  }

  // Divide work into blocks of equal cost. Both the block count and the block
  // cost are clamped to at least 1: a zero TasksPerThread must not divide by
  // zero, and a zero block cost would put every function in its own task.
  const uint64_t TasksPerThr = TasksPerThread ? TasksPerThread : 1;
  const uint64_t BlocksCount = TasksPerThr * ThreadCount;
  const uint64_t BlockCost =
      TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;

  ThreadPool &ThPool = getThreadPool();
  auto BlockBegin = BFs.begin();
  uint64_t CurrentCost = 0;

  for (auto It = BFs.begin(), End = BFs.end(); It != End; ++It) {
    const BinaryFunction &BF = It->second;
    if (isSkippedForCost(BF))
      continue;

    CurrentCost += costOf(BF);
    if (CurrentCost >= BlockCost) {
      ThPool.async(runBlock, BlockBegin, std::next(It));
      BlockBegin = std::next(It);
      CurrentCost = 0;
    }
  }

  // Schedule whatever is left over, unless the last block ended exactly at the
  // end of the map.
  if (BlockBegin != BFs.end())
    ThPool.async(runBlock, BlockBegin, BFs.end());
  ThPool.wait();
}
|
||||
} // namespace ParallelUtilities
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
|
@ -0,0 +1,59 @@
|
|||
//===-- ParallelUtilities.h - ----------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// This file declares an interface that can be used to run parallel tasks that
|
||||
// operate on functions. Several scheduling criteria are supported using
|
||||
// SchedulingPolicy, and are defined by how the runtime cost should be
|
||||
// estimated.
|
||||
// If the NoThreads flag is passed, the work executes sequentially.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
|
||||
#define LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
|
||||
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryFunction.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace opts {
|
||||
extern cl::opt<unsigned> ThreadCount;
|
||||
extern cl::opt<bool> NoThreads;
|
||||
extern cl::opt<unsigned> TaskCount;
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
namespace ParallelUtilities {
|
||||
|
||||
using WorkFuncTy = std::function<void(BinaryFunction &BF)>;
|
||||
using PredicateTy = std::function<bool(const BinaryFunction &BF)>;
|
||||
|
||||
/// Selects how runOnEachFunction estimates the runtime cost of the functions
/// when it divides the work into blocks.
enum SchedulingPolicy {
|
||||
SP_TRIVIAL, ///< cost is estimated by the number of functions
|
||||
SP_CONSTANT, ///< cost is estimated by the number of non-skipped functions
|
||||
SP_LINEAR, ///< cost is estimated by the size of non-skipped functions
|
||||
SP_QUADRATIC ///< cost is estimated by the square of the size of non-skipped
|
||||
///< functions
|
||||
};
|
||||
|
||||
/// Return the managed thread pool, initializing it first if it has not been initialized yet
|
||||
ThreadPool &getThreadPool();
|
||||
|
||||
// Perform the work on each binary function, except those that are accepted
|
||||
// by the SkipPredicate. The scheduling heuristic is based on SchedPolicy.
|
||||
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
|
||||
WorkFuncTy WorkFunction,
|
||||
PredicateTy SkipPredicate = PredicateTy(),
|
||||
std::string LogName = "",
|
||||
unsigned TasksPerThread = opts::TaskCount);
|
||||
} // namespace ParallelUtilities
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
#endif
|
|
@ -10,6 +10,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "BinaryPasses.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "Passes/ReorderAlgorithm.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
#include <numeric>
|
||||
|
@ -294,7 +295,10 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
|
|||
DeletedBlocks += Count;
|
||||
DeletedBytes += Bytes;
|
||||
if (Count) {
|
||||
Modified.insert(&Function);
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(ModifiedMtx);
|
||||
Modified.insert(&Function);
|
||||
}
|
||||
if (opts::Verbosity > 0) {
|
||||
outs() << "BOLT-INFO: Removed " << Count
|
||||
<< " dead basic block(s) accounting for " << Bytes
|
||||
|
@ -305,12 +309,18 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
|
|||
}
|
||||
|
||||
/// Apply runOnFunction to the functions of \p BC that shouldOptimize() accepts
/// and print a summary of the number of blocks and bytes removed.
///
/// NOTE(review): this listing looks like a rendered diff without +/- markers,
/// so both a sequential loop and a ParallelUtilities dispatch are visible in
/// the body. Presumably only one of them exists in the real file -- confirm
/// against the actual source before relying on this text.
void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
if (shouldOptimize(Function)) {
|
||||
runOnFunction(Function);
|
||||
}
|
||||
}
|
||||
// Work item: run the pass on a single function.
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
|
||||
runOnFunction(BF);
|
||||
};
|
||||

||||
// Skip predicate: true for functions that shouldOptimize() rejects.
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
|
||||
return !shouldOptimize(BF);
|
||||
};
|
||||

||||
// SP_TRIVIAL estimates the cost by the number of functions.
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
|
||||
"EliminateUnreachableBlocks");
|
||||

||||
outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
|
||||
<< DeletedBytes << " bytes of code.\n";
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
#include "DynoStats.h"
|
||||
#include "HFSort.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
@ -95,9 +95,10 @@ public:
|
|||
/// Detect and eliminate unreachable basic blocks. We could have those
|
||||
/// filled with nops and they are used for alignment.
|
||||
class EliminateUnreachableBlocks : public BinaryFunctionPass {
|
||||
std::shared_timed_mutex ModifiedMtx;
|
||||
std::unordered_set<const BinaryFunction *> Modified;
|
||||
unsigned DeletedBlocks{0};
|
||||
uint64_t DeletedBytes{0};
|
||||
std::atomic<unsigned> DeletedBlocks{0};
|
||||
std::atomic<uint64_t> DeletedBytes{0};
|
||||
void runOnFunction(BinaryFunction& Function);
|
||||
public:
|
||||
EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
#include "FrameAnalysis.h"
|
||||
#include "CallGraphWalker.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
#include <fstream>
|
||||
|
||||
|
@ -20,8 +21,6 @@ using namespace llvm;
|
|||
namespace opts {
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<unsigned> Verbosity;
|
||||
extern cl::opt<bool> NoThreads;
|
||||
extern cl::opt<int> ThreadCount;
|
||||
|
||||
extern bool shouldProcess(const bolt::BinaryFunction &Function);
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "Passes/IdenticalCodeFolding.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
#include "llvm/Support/ThreadPool.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
|
@ -26,8 +27,6 @@ using namespace bolt;
|
|||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<int> ThreadCount;
|
||||
extern cl::opt<int> NoThreads;
|
||||
|
||||
static cl::opt<bool>
|
||||
UseDFS("icf-dfs",
|
||||
|
|
|
@ -104,18 +104,6 @@ PerfDataA("p",
|
|||
cl::desc("Alias for -perfdata"),
|
||||
cl::aliasopt(PerfData),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
cl::opt<int>
|
||||
ThreadCount("thread-count",
|
||||
cl::desc("number of threads"),
|
||||
cl::init(hardware_concurrency()),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
cl::opt<bool>
|
||||
NoThreads("no-threads",
|
||||
cl::desc("disbale multithreading"),
|
||||
cl::init(false),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
} // namespace opts
|
||||
static StringRef ToolName;
|
||||
|
|
Loading…
Reference in New Issue