Create a general interface to implement parallel tasks easily and apply it to run EliminateUnreachableBlocks in parallel.

Summary:
Each time we run work in parallel over the list of functions in BOLT, we have to manage a thread pool, schedule tasks, and tune the granularity of those tasks based on the type of work being done.

In this change, I am creating an interface that abstracts all of those details away: the user provides the function that will run on each binary function, plus some policy parameters that set up the scheduling and granularity configuration.

This will make it easier to implement parallel tasks, and eliminate redundant coding efforts.

(cherry picked from FBD16116077)
This commit is contained in:
laith sakka 2019-07-03 17:23:19 -07:00 committed by Maksim Panchenko
parent f10d1fe0f3
commit 3cfc76cdbf
8 changed files with 222 additions and 26 deletions

View File

@ -83,6 +83,7 @@ add_llvm_tool(llvm-bolt
Heatmap.cpp
JumpTable.cpp
MCPlusBuilder.cpp
ParallelUtilities.cpp
ProfileReader.cpp
ProfileWriter.cpp
Relocation.cpp

View File

@ -0,0 +1,139 @@
//===--- ParallelUtilities.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "ParallelUtilities.h"
#include "llvm/Support/Timer.h"
#include <mutex>
#include <shared_mutex>
#define DEBUG_TYPE "par-utils"
// Command-line options that configure the parallel utilities. They are
// declared `extern` in ParallelUtilities.h so other translation units can
// read them.
namespace opts {
extern cl::OptionCategory BoltCategory;
// Number of threads handed to the shared thread pool; initialized from
// hardware_concurrency().
cl::opt<unsigned>
ThreadCount("thread-count",
cl::desc("number of threads"),
cl::init(hardware_concurrency()),
cl::cat(BoltCategory));
// When set, runOnEachFunction processes all functions sequentially on the
// calling thread instead of using the thread pool.
cl::opt<bool>
NoThreads("no-threads",
cl::desc("disable multithreading"),
cl::init(false),
cl::cat(BoltCategory));
// Number of tasks created per thread; used as the default value of the
// TasksPerThread argument of runOnEachFunction.
cl::opt<unsigned>
TaskCount("tasks-per-thread",
cl::desc("number of tasks to be created per thread"),
cl::init(20),
cl::cat(BoltCategory));
}
namespace {
/// A single thread pool that is used to run parallel tasks.
/// It starts out empty and is created on the first call to getThreadPool().
std::unique_ptr<ThreadPool> ThPoolPtr;
} // namespace
namespace llvm {
namespace bolt {
namespace ParallelUtilities {
/// Return the shared thread pool, constructing it with opts::ThreadCount the
/// first time it is requested. Subsequent calls return the same instance.
ThreadPool &getThreadPool() {
  if (!ThPoolPtr)
    ThPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);
  return *ThPoolPtr;
}
/// Apply \p WorkFunction to every binary function of \p BC for which
/// \p SkipPredicate is empty or returns false.
///
/// Unless opts::NoThreads is set (or no worker threads are configured), the
/// function map is cut into contiguous blocks of roughly equal estimated cost
/// and each block is submitted to the shared thread pool as one task. The call
/// returns only after all submitted tasks have finished.
///
/// \param BC             context whose binary functions are processed.
/// \param SchedPolicy    cost model used to balance the blocks.
/// \param WorkFunction   callable invoked on each non-skipped function.
/// \param SkipPredicate  optional; functions for which it returns true are
///                       not passed to \p WorkFunction.
/// \param LogName        name given to the per-block timer.
/// \param TasksPerThread number of blocks to aim for per thread.
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
                       WorkFuncTy WorkFunction, PredicateTy SkipPredicate,
                       std::string LogName, unsigned TasksPerThread) {
  using FunctionIterator = std::map<uint64_t, BinaryFunction>::iterator;

  // Process one contiguous range of functions. Captures by reference, which
  // is safe because this function waits for the pool before returning.
  auto runBlock = [&](FunctionIterator BlockBegin, FunctionIterator BlockEnd) {
    Timer T(LogName, LogName);
    DEBUG(T.startTimer());
    for (auto It = BlockBegin; It != BlockEnd; ++It) {
      auto &BF = It->second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;
      WorkFunction(BF);
    }
    DEBUG(T.stopTimer());
  };

  auto &Functions = BC.getBinaryFunctions();

  // With zero threads the block count below would be zero and the cost
  // division undefined, so fall back to the sequential path in that case too.
  const unsigned NumThreads = opts::ThreadCount;
  if (opts::NoThreads || NumThreads == 0) {
    runBlock(Functions.begin(), Functions.end());
    return;
  }

  // Estimated cost of a single function under the selected policy. Costs are
  // kept in 64 bits so that summing squared sizes does not wrap around.
  auto costOf = [&](BinaryFunction &BF) -> uint64_t {
    if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL ||
        SchedPolicy == SchedulingPolicy::SP_CONSTANT)
      return 1;
    if (SchedPolicy == SchedulingPolicy::SP_LINEAR)
      return BF.size();
    if (SchedPolicy == SchedulingPolicy::SP_QUADRATIC)
      return static_cast<uint64_t>(BF.size()) * BF.size();
    return 0;
  };

  // Estimate the overall runtime cost using the scheduling policy. The
  // trivial policy deliberately does not consult the skip predicate.
  uint64_t TotalCost = 0;
  if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL) {
    TotalCost = Functions.size();
  } else {
    for (auto &BFI : Functions) {
      auto &BF = BFI.second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;
      TotalCost += costOf(BF);
    }
  }

  // Divide work into blocks of equal cost. Both the block count and the
  // per-block cost are clamped to at least one: a zero block count would
  // divide by zero, and a zero block cost would turn every function into its
  // own task.
  const uint64_t BlocksCount =
      static_cast<uint64_t>(TasksPerThread ? TasksPerThread : 1) * NumThreads;
  uint64_t BlockCost = TotalCost / BlocksCount;
  if (BlockCost == 0)
    BlockCost = 1;

  ThreadPool &ThPool = getThreadPool();
  auto BlockBegin = Functions.begin();
  uint64_t CurrentCost = 0;
  for (auto It = Functions.begin(), End = Functions.end(); It != End; ++It) {
    auto &BF = It->second;
    // Skipped functions add no cost; they simply ride along in the current
    // block and are filtered out again inside runBlock.
    if (SchedPolicy != SchedulingPolicy::SP_TRIVIAL && SkipPredicate &&
        SkipPredicate(BF))
      continue;
    CurrentCost += costOf(BF);
    if (CurrentCost >= BlockCost) {
      auto BlockEnd = std::next(It);
      ThPool.async(runBlock, BlockBegin, BlockEnd);
      BlockBegin = BlockEnd;
      CurrentCost = 0;
    }
  }

  // Submit whatever is left over, unless the last block ended exactly at the
  // end of the map.
  if (BlockBegin != Functions.end())
    ThPool.async(runBlock, BlockBegin, Functions.end());
  ThPool.wait();
}
} // namespace ParallelUtilities
} // namespace bolt
} // namespace llvm

View File

@ -0,0 +1,59 @@
//===-- ParallelUtilities.h - ----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This file declares an interface that can be used to run parallel tasks that
// operate on functions. Several scheduling criteria are supported using
// SchedulingPolicy, and are defined by how the runtime cost should be
// estimated.
// If the NoThreads flag is passed, work will execute sequentially.
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
#define LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
#include "llvm/Support/ThreadPool.h"
#include "BinaryContext.h"
#include "BinaryFunction.h"
using namespace llvm;
// Command-line options defined in ParallelUtilities.cpp and shared with every
// translation unit that includes this header.
namespace opts {
extern cl::opt<unsigned> ThreadCount; // "thread-count": number of threads
extern cl::opt<bool> NoThreads;       // "no-threads": disable multithreading
extern cl::opt<unsigned> TaskCount;   // "tasks-per-thread": tasks per thread
}
namespace llvm {
namespace bolt {
namespace ParallelUtilities {
/// Callable invoked on each binary function that is not skipped.
using WorkFuncTy = std::function<void(BinaryFunction &BF)>;
/// Callable that returns true for binary functions that should be skipped.
using PredicateTy = std::function<bool(const BinaryFunction &BF)>;
/// Selects how the runtime cost of the work is estimated when it is divided
/// into tasks.
enum SchedulingPolicy {
SP_TRIVIAL, /// cost is estimated by the number of functions
SP_CONSTANT, /// cost is estimated by the number of non-skipped functions
SP_LINEAR, /// cost is estimated by the size of non-skipped functions
SP_QUADRATIC /// cost is estimated by the square of the size of non-skipped
/// functions
};
/// Return the managed thread pool, initializing it if it is not yet
/// initialized.
ThreadPool &getThreadPool();
/// Perform the work on each binary function, except those that are accepted
/// by the SkipPredicate; the scheduling heuristic is based on SchedPolicy.
///
/// \param BC             context that owns the binary functions to visit.
/// \param SchedPolicy    cost model used to size the tasks.
/// \param WorkFunction   work to run on each non-skipped function.
/// \param SkipPredicate  optional filter; when left empty no function is
///                       skipped.
/// \param LogName        name used for the per-task timer.
/// \param TasksPerThread number of tasks to create per thread; defaults to
///                       the value of opts::TaskCount.
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncTy WorkFunction,
PredicateTy SkipPredicate = PredicateTy(),
std::string LogName = "",
unsigned TasksPerThread = opts::TaskCount);
} // namespace ParallelUtilities
} // namespace bolt
} // namespace llvm
#endif

View File

@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "BinaryPasses.h"
#include "ParallelUtilities.h"
#include "Passes/ReorderAlgorithm.h"
#include "llvm/Support/Options.h"
#include <numeric>
@ -294,7 +295,10 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
DeletedBlocks += Count;
DeletedBytes += Bytes;
if (Count) {
Modified.insert(&Function);
{
std::unique_lock<std::shared_timed_mutex> Lock(ModifiedMtx);
Modified.insert(&Function);
}
if (opts::Verbosity > 0) {
outs() << "BOLT-INFO: Removed " << Count
<< " dead basic block(s) accounting for " << Bytes
@ -305,12 +309,18 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
}
void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
auto &Function = It.second;
if (shouldOptimize(Function)) {
runOnFunction(Function);
}
}
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
runOnFunction(BF);
};
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
return !shouldOptimize(BF);
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
"EliminateUnreachableBlocks");
outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
<< DeletedBytes << " bytes of code.\n";
}

View File

@ -19,7 +19,7 @@
#include "DynoStats.h"
#include "HFSort.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <map>
#include <set>
#include <string>
@ -95,9 +95,10 @@ public:
/// Detect and eliminate unreachable basic blocks. We could have those
/// filled with nops and they are used for alignment.
class EliminateUnreachableBlocks : public BinaryFunctionPass {
std::shared_timed_mutex ModifiedMtx;
std::unordered_set<const BinaryFunction *> Modified;
unsigned DeletedBlocks{0};
uint64_t DeletedBytes{0};
std::atomic<unsigned> DeletedBlocks{0};
std::atomic<uint64_t> DeletedBytes{0};
void runOnFunction(BinaryFunction& Function);
public:
EliminateUnreachableBlocks(const cl::opt<bool> &PrintPass)

View File

@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "FrameAnalysis.h"
#include "CallGraphWalker.h"
#include "ParallelUtilities.h"
#include "llvm/Support/ThreadPool.h"
#include <fstream>
@ -20,8 +21,6 @@ using namespace llvm;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> NoThreads;
extern cl::opt<int> ThreadCount;
extern bool shouldProcess(const bolt::BinaryFunction &Function);

View File

@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "Passes/IdenticalCodeFolding.h"
#include "ParallelUtilities.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Timer.h"
@ -26,8 +27,6 @@ using namespace bolt;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<int> ThreadCount;
extern cl::opt<int> NoThreads;
static cl::opt<bool>
UseDFS("icf-dfs",

View File

@ -104,18 +104,6 @@ PerfDataA("p",
cl::desc("Alias for -perfdata"),
cl::aliasopt(PerfData),
cl::cat(AggregatorCategory));
cl::opt<int>
ThreadCount("thread-count",
cl::desc("number of threads"),
cl::init(hardware_concurrency()),
cl::cat(BoltCategory));
cl::opt<bool>
NoThreads("no-threads",
cl::desc("disbale multithreading"),
cl::init(false),
cl::cat(BoltCategory));
} // namespace opts
static StringRef ToolName;