forked from OSchip/llvm-project
Run reorder blocks in parallel
Summary: This diff changes the reorderBasicBlocks pass to run in parallel. It does so by adding locks to the fixBranches function and by creating temporary MCCodeEmitters when estimating basic block code size. (cherry picked from FBD16161149)
This commit is contained in:
parent
1169f1fdd8
commit
9977b03fea
|
@ -12,6 +12,7 @@
|
|||
#include "BinaryBasicBlock.h"
|
||||
#include "BinaryContext.h"
|
||||
#include "BinaryFunction.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/MC/MCAsmInfo.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
|
@ -455,6 +456,7 @@ void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) {
|
|||
assert(isSuccessor(Successor));
|
||||
auto &BC = Function->getBinaryContext();
|
||||
MCInst NewInst;
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
BC.MIB->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get());
|
||||
Instructions.emplace_back(std::move(NewInst));
|
||||
}
|
||||
|
@ -537,8 +539,8 @@ void BinaryBasicBlock::dump() const {
|
|||
outs() << "\n";
|
||||
}
|
||||
|
||||
uint64_t BinaryBasicBlock::estimateSize() const {
|
||||
return Function->getBinaryContext().computeCodeSize(begin(), end());
|
||||
uint64_t BinaryBasicBlock::estimateSize(const MCCodeEmitter *Emitter) const {
|
||||
return Function->getBinaryContext().computeCodeSize(begin(), end(), Emitter);
|
||||
}
|
||||
|
||||
BinaryBasicBlock::BinaryBranchInfo &
|
||||
|
|
|
@ -16,14 +16,15 @@
|
|||
|
||||
#include "llvm/ADT/GraphTraits.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/MC/MCCodeEmitter.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorOr.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
|
@ -870,8 +871,11 @@ public:
|
|||
return InputRange.second - InputRange.first;
|
||||
}
|
||||
|
||||
/// Returns an estimate of size of basic block during run time.
|
||||
uint64_t estimateSize() const;
|
||||
/// Returns an estimate of size of basic block during run time optionally
|
||||
/// using a user-supplied emitter for lock-free multi-thread work.
|
||||
/// MCCodeEmitter is not thread safe and each thread should operate with its
|
||||
/// own copy of it.
|
||||
uint64_t estimateSize(const MCCodeEmitter *Emitter = nullptr) const;
|
||||
|
||||
/// Return index in the current layout. The user is responsible for
|
||||
/// making sure the indices are up to date,
|
||||
|
|
|
@ -1615,26 +1615,23 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF) {
|
|||
BF.fixBranches();
|
||||
|
||||
// Create local MC context to isolate the effect of ephemeral code emission.
|
||||
std::unique_ptr<MCObjectFileInfo> LocalMOFI =
|
||||
llvm::make_unique<MCObjectFileInfo>();
|
||||
std::unique_ptr<MCContext> LocalCtx =
|
||||
llvm::make_unique<MCContext>(AsmInfo.get(), MRI.get(), LocalMOFI.get());
|
||||
LocalMOFI->InitMCObjectFileInfo(*TheTriple, /*PIC=*/false, *LocalCtx);
|
||||
auto MCEInstance = createIndependentMCCodeEmitter();
|
||||
auto *LocalCtx = MCEInstance.LocalCtx.get();
|
||||
auto *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCTargetOptions());
|
||||
auto *MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *LocalCtx);
|
||||
|
||||
SmallString<256> Code;
|
||||
raw_svector_ostream VecOS(Code);
|
||||
|
||||
std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
|
||||
*TheTriple, *LocalCtx, std::unique_ptr<MCAsmBackend>(MAB), VecOS,
|
||||
std::unique_ptr<MCCodeEmitter>(MCE), *STI,
|
||||
std::unique_ptr<MCCodeEmitter>(MCEInstance.MCE.release()), *STI,
|
||||
/* RelaxAll */ false,
|
||||
/* IncrementalLinkerCompatible */ false,
|
||||
/* DWARFMustBeAtTheEnd */ false));
|
||||
|
||||
Streamer->InitSections(false);
|
||||
|
||||
auto *Section = LocalMOFI->getTextSection();
|
||||
auto *Section = MCEInstance.LocalMOFI->getTextSection();
|
||||
Section->setHasInstructions(true);
|
||||
|
||||
auto *StartLabel = LocalCtx->getOrCreateSymbol("__hstart");
|
||||
|
|
|
@ -915,26 +915,33 @@ public:
|
|||
/// size is for the cold one.
|
||||
std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF);
|
||||
|
||||
/// Calculate the size of the instruction \p Inst.
|
||||
uint64_t computeInstructionSize(const MCInst &Inst) const {
|
||||
/// Calculate the size of the instruction \p Inst optionally using a
|
||||
/// user-supplied emitter for lock-free multi-thread work. MCCodeEmitter is
|
||||
/// not thread safe and each thread should operate with its own copy of it.
|
||||
uint64_t
|
||||
computeInstructionSize(const MCInst &Inst,
|
||||
const MCCodeEmitter *Emitter = nullptr) const {
|
||||
if (!Emitter)
|
||||
Emitter = this->MCE.get();
|
||||
SmallString<256> Code;
|
||||
SmallVector<MCFixup, 4> Fixups;
|
||||
raw_svector_ostream VecOS(Code);
|
||||
MCE->encodeInstruction(Inst, VecOS, Fixups, *STI);
|
||||
|
||||
Emitter->encodeInstruction(Inst, VecOS, Fixups, *STI);
|
||||
return Code.size();
|
||||
}
|
||||
|
||||
/// Compute the native code size for a range of instructions.
|
||||
/// Note: this can be imprecise wrt the final binary since happening prior to
|
||||
/// relaxation, as well as wrt the original binary because of opcode
|
||||
/// shortening.
|
||||
/// shortening. MCCodeEmitter is not thread safe and each thread should operate
|
||||
/// with its own copy of it.
|
||||
template <typename Itr>
|
||||
uint64_t computeCodeSize(Itr Beg, Itr End) const {
|
||||
uint64_t computeCodeSize(Itr Beg, Itr End,
|
||||
const MCCodeEmitter *Emitter = nullptr) const {
|
||||
uint64_t Size = 0;
|
||||
while (Beg != End) {
|
||||
if (!MII->get(Beg->getOpcode()).isPseudo())
|
||||
Size += computeInstructionSize(*Beg);
|
||||
Size += computeInstructionSize(*Beg, Emitter);
|
||||
++Beg;
|
||||
}
|
||||
return Size;
|
||||
|
@ -999,6 +1006,30 @@ public:
|
|||
|
||||
void exitWithBugReport(StringRef Message,
|
||||
const BinaryFunction &Function) const;
|
||||
|
||||
struct IndependentCodeEmitter {
|
||||
std::unique_ptr<MCObjectFileInfo> LocalMOFI;
|
||||
std::unique_ptr<MCContext> LocalCtx;
|
||||
std::unique_ptr<MCCodeEmitter> MCE;
|
||||
};
|
||||
|
||||
/// Encapsulates an independent MCCodeEmitter that doesn't share resources
|
||||
/// with the main one available through BinaryContext::MCE, managed by
|
||||
/// BinaryContext.
|
||||
/// This is intended to create a lock-free environment for an auxiliary thread
|
||||
/// that needs to perform work with an MCCodeEmitter that can be transient or
|
||||
/// won't be used in the main code emitter.
|
||||
IndependentCodeEmitter createIndependentMCCodeEmitter() const {
|
||||
IndependentCodeEmitter MCEInstance;
|
||||
MCEInstance.LocalMOFI = llvm::make_unique<MCObjectFileInfo>();
|
||||
MCEInstance.LocalCtx = llvm::make_unique<MCContext>(
|
||||
AsmInfo.get(), MRI.get(), MCEInstance.LocalMOFI.get());
|
||||
MCEInstance.LocalMOFI->InitMCObjectFileInfo(*TheTriple, /*PIC=*/false,
|
||||
*MCEInstance.LocalCtx);
|
||||
MCEInstance.MCE.reset(
|
||||
TheTarget->createMCCodeEmitter(*MII, *MRI, *MCEInstance.LocalCtx));
|
||||
return MCEInstance;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
|
|
|
@ -2634,7 +2634,7 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart,
|
|||
bool EmitCodeOnly, bool LabelsForOffsets) {
|
||||
if (!EmitCodeOnly && EmitColdPart && hasConstantIsland())
|
||||
duplicateConstantIslands();
|
||||
|
||||
|
||||
// Track first emitted instruction with debug info.
|
||||
bool FirstInstr = true;
|
||||
for (auto BB : layout()) {
|
||||
|
@ -2699,7 +2699,9 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart,
|
|||
|
||||
// Prepare to tag this location with a label if we need to keep track of
|
||||
// the location of calls/returns for BOLT address translation maps
|
||||
if (LabelsForOffsets && BC.MIB->hasAnnotation(Instr, "Offset")) {
|
||||
if (!EmitCodeOnly && LabelsForOffsets &&
|
||||
BC.MIB->hasAnnotation(Instr, "Offset")) {
|
||||
|
||||
MCSymbol *LocSym = BC.Ctx->createTempSymbol(/*CanBeUnnamed=*/true);
|
||||
Streamer.EmitLabel(LocSym);
|
||||
BC.MIB->addAnnotation(Instr, "LocSym",
|
||||
|
@ -2708,11 +2710,11 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart,
|
|||
}
|
||||
|
||||
// Emit SDT labels
|
||||
if (BC.MIB->hasAnnotation(Instr, "SDTMarker")) {
|
||||
if (!EmitCodeOnly && BC.MIB->hasAnnotation(Instr, "SDTMarker")) {
|
||||
auto OriginalAddress =
|
||||
BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "SDTMarker").get();
|
||||
auto *SDTLabel = BC.SDTMarkers[OriginalAddress].Label;
|
||||
|
||||
|
||||
// A given symbol should only be emitted as a label once
|
||||
if (SDTLabel->isUndefined())
|
||||
Streamer.EmitLabel(SDTLabel);
|
||||
|
@ -3263,9 +3265,13 @@ void BinaryFunction::fixBranches() {
|
|||
const auto *FSuccessor = BB->getConditionalSuccessor(false);
|
||||
if (NextBB && NextBB == TSuccessor) {
|
||||
std::swap(TSuccessor, FSuccessor);
|
||||
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx);
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx);
|
||||
}
|
||||
BB->swapConditionalSuccessors();
|
||||
} else {
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
MIB->replaceBranchTarget(*CondBranch, TSuccessor->getLabel(), Ctx);
|
||||
}
|
||||
if (TSuccessor == FSuccessor) {
|
||||
|
@ -3279,7 +3285,11 @@ void BinaryFunction::fixBranches() {
|
|||
TSuccessor->isCold() != FSuccessor->isCold() &&
|
||||
BB->isCold() != TSuccessor->isCold()) {
|
||||
std::swap(TSuccessor, FSuccessor);
|
||||
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx);
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(),
|
||||
Ctx);
|
||||
}
|
||||
BB->swapConditionalSuccessors();
|
||||
}
|
||||
BB->addBranchInstruction(FSuccessor);
|
||||
|
@ -3849,13 +3859,13 @@ SMLoc BinaryFunction::emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc,
|
|||
// Always emit is_stmt at the beginning of function fragment.
|
||||
if (FirstInstr)
|
||||
Flags |= DWARF2_FLAG_IS_STMT;
|
||||
|
||||
|
||||
BC.Ctx->setCurrentDwarfLoc(
|
||||
CurrentFilenum,
|
||||
CurrentRow.Line,
|
||||
CurrentRow.Line,
|
||||
CurrentRow.Column,
|
||||
Flags,
|
||||
CurrentRow.Isa,
|
||||
Flags,
|
||||
CurrentRow.Isa,
|
||||
CurrentRow.Discriminator);
|
||||
BC.Ctx->setDwarfCompileUnitID(FunctionUnitIndex);
|
||||
|
||||
|
|
|
@ -20,13 +20,13 @@
|
|||
namespace opts {
|
||||
extern cl::OptionCategory BoltCategory;
|
||||
|
||||
cl::opt<unsigned>
|
||||
cl::opt<unsigned>
|
||||
ThreadCount("thread-count",
|
||||
cl::desc("number of threads"),
|
||||
cl::init(hardware_concurrency()),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
cl::opt<bool>
|
||||
cl::opt<bool>
|
||||
NoThreads("no-threads",
|
||||
cl::desc("disable multithreading"),
|
||||
cl::init(false),
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "ParallelUtilities.h"
|
||||
#include "Passes/ReorderAlgorithm.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
|
@ -335,31 +336,32 @@ void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
|
|||
return;
|
||||
|
||||
IsAArch64 = BC.isAArch64();
|
||||
std::atomic<uint64_t> ModifiedFuncCount{0};
|
||||
|
||||
uint64_t ModifiedFuncCount = 0;
|
||||
for (auto &It : BC.getBinaryFunctions()) {
|
||||
auto &Function = It.second;
|
||||
|
||||
if (!shouldOptimize(Function))
|
||||
continue;
|
||||
|
||||
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
|
||||
const bool ShouldSplit =
|
||||
(opts::SplitFunctions == BinaryFunction::ST_ALL) ||
|
||||
(opts::SplitFunctions == BinaryFunction::ST_EH &&
|
||||
Function.hasEHRanges()) ||
|
||||
Function.shouldSplit();
|
||||
modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters,
|
||||
(opts::SplitFunctions == BinaryFunction::ST_ALL) ||
|
||||
(opts::SplitFunctions == BinaryFunction::ST_EH && BF.hasEHRanges()) ||
|
||||
BF.shouldSplit();
|
||||
modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters,
|
||||
ShouldSplit);
|
||||
|
||||
if (Function.hasLayoutChanged()) {
|
||||
if (BF.hasLayoutChanged()) {
|
||||
++ModifiedFuncCount;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
|
||||
return !shouldOptimize(BF);
|
||||
};
|
||||
|
||||
ParallelUtilities::runOnEachFunction(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_LINEAR, WorkFun, SkipFunc,
|
||||
"ReorderBasicBlocks");
|
||||
|
||||
outs() << "BOLT-INFO: basic block reordering modified layout of "
|
||||
<< format("%zu (%.2lf%%) functions\n",
|
||||
ModifiedFuncCount,
|
||||
100.0 * ModifiedFuncCount / BC.getBinaryFunctions().size());
|
||||
<< format("%zu (%.2lf%%) functions\n", ModifiedFuncCount.load(),
|
||||
100.0 * ModifiedFuncCount.load() /
|
||||
BC.getBinaryFunctions().size());
|
||||
|
||||
if (opts::PrintFuncStat > 0) {
|
||||
raw_ostream &OS = outs();
|
||||
|
@ -373,8 +375,8 @@ void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
|
|||
|
||||
OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
|
||||
OS << " There are " << BFs.size() << " functions in total. \n";
|
||||
OS << " Number of functions being modified: " << ModifiedFuncCount
|
||||
<< "\n";
|
||||
OS << " Number of functions being modified: "
|
||||
<< ModifiedFuncCount.load() << "\n";
|
||||
OS << " User asks for detailed information on top "
|
||||
<< opts::PrintFuncStat << " functions. (Ranked by function score)"
|
||||
<< "\n\n";
|
||||
|
|
|
@ -23,6 +23,7 @@ using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
|
|||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<bool> NoThreads;
|
||||
|
||||
cl::opt<unsigned>
|
||||
ClusterSplitThreshold("cluster-split-threshold",
|
||||
|
@ -288,6 +289,12 @@ private:
|
|||
ExecutionCounts[BB->getLayoutIndex()] = EC;
|
||||
}
|
||||
|
||||
// Create a separate MCCodeEmitter to allow lock-free execution
|
||||
BinaryContext::IndependentCodeEmitter Emitter;
|
||||
if (!opts::NoThreads) {
|
||||
Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();
|
||||
}
|
||||
|
||||
// Initialize clusters
|
||||
Clusters.reserve(BF.layout_size());
|
||||
AllClusters.reserve(BF.layout_size());
|
||||
|
@ -295,7 +302,8 @@ private:
|
|||
Size.reserve(BF.layout_size());
|
||||
for (auto BB : BF.layout()) {
|
||||
size_t Index = BB->getLayoutIndex();
|
||||
Size.push_back(std::max<uint64_t>(BB->estimateSize(), 1));
|
||||
Size.push_back(
|
||||
std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1));
|
||||
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
|
||||
Clusters.push_back(&AllClusters[Index]);
|
||||
CurCluster.push_back(&AllClusters[Index]);
|
||||
|
|
|
@ -27,6 +27,7 @@ using namespace bolt;
|
|||
namespace opts {
|
||||
|
||||
extern cl::OptionCategory BoltOptCategory;
|
||||
extern cl::opt<bool> NoThreads;
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintClusters("print-clusters",
|
||||
|
@ -65,7 +66,13 @@ struct HashPair {
|
|||
|
||||
}
|
||||
|
||||
void ClusterAlgorithm::computeClusterAverageFrequency() {
|
||||
void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
|
||||
// Create a separate MCCodeEmitter to allow lock-free execution
|
||||
BinaryContext::IndependentCodeEmitter Emitter;
|
||||
if (!opts::NoThreads) {
|
||||
Emitter = BC.createIndependentMCCodeEmitter();
|
||||
}
|
||||
|
||||
AvgFreq.resize(Clusters.size(), 0.0);
|
||||
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
|
||||
double Freq = 0.0;
|
||||
|
@ -75,7 +82,7 @@ void ClusterAlgorithm::computeClusterAverageFrequency() {
|
|||
Freq += BB->getExecutionCount();
|
||||
// Estimate the size of a block in bytes at run time
|
||||
// NOTE: This might be inaccurate
|
||||
ClusterSize += BB->estimateSize();
|
||||
ClusterSize += BB->estimateSize(Emitter.MCE.get());
|
||||
}
|
||||
}
|
||||
AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
|
||||
|
@ -525,7 +532,7 @@ void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
|
|||
auto &ClusterEdges = CAlgo->ClusterEdges;
|
||||
|
||||
// Compute clusters' average frequencies.
|
||||
CAlgo->computeClusterAverageFrequency();
|
||||
CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
|
||||
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
|
||||
|
||||
if (opts::PrintClusters)
|
||||
|
@ -621,13 +628,13 @@ void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
|
|||
const BinaryFunction &BF, BasicBlockOrder &Order) const {
|
||||
if (BF.layout_empty())
|
||||
return;
|
||||
|
||||
|
||||
// Cluster basic blocks.
|
||||
CAlgo->clusterBasicBlocks(BF);
|
||||
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
|
||||
|
||||
// Compute clusters' average frequencies.
|
||||
CAlgo->computeClusterAverageFrequency();
|
||||
CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
|
||||
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
|
||||
|
||||
if (opts::PrintClusters)
|
||||
|
|
|
@ -53,7 +53,7 @@ public:
|
|||
/// the sum of average frequencies of its blocks (execution count / # instrs).
|
||||
/// The average frequencies are stored in the AvgFreq vector, indexed by the
|
||||
/// cluster indices in the Clusters vector.
|
||||
void computeClusterAverageFrequency();
|
||||
void computeClusterAverageFrequency(const BinaryContext &BC);
|
||||
|
||||
/// Clear clusters and related info.
|
||||
virtual void reset();
|
||||
|
|
Loading…
Reference in New Issue