From bdf21f7617e4155efbf3df7ce277901ca3825ac9 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 7 Nov 2017 15:42:28 -0800 Subject: [PATCH] [BOLT] Align basic blocks based on execution count Summary: The default is not changing, i.e. we are not aligning code within a function by default. New meaning of options for aligning basic blocks: -align-blocks triggers basic block alignment based on profile -preserve-blocks-alignment tries to preserve basic block alignment seen on input Tuning options for "-align-blocks": -align-blocks-min-size=<uint> blocks smaller than the specified size will not be aligned -align-blocks-threshold=<uint> align only blocks with frequency larger than containing function execution frequency specified in percent. E.g. 1000 means aligning blocks that are 10 times more frequently executed than the containing function. (cherry picked from FBD7921980) --- bolt/src/BinaryBasicBlock.h | 19 +++++-- bolt/src/BinaryFunction.cpp | 24 ++++++--- bolt/src/Passes/Aligner.cpp | 101 +++++++++++++++++++++++++++++++++--- bolt/src/Passes/Aligner.h | 13 ++++- 4 files changed, 141 insertions(+), 16 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index cf3b6dd27d63..946de333093e 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -80,7 +80,10 @@ private: std::pair InputRange{INVALID_OFFSET, INVALID_OFFSET}; /// Alignment requirements for the block. - uint64_t Alignment{1}; + uint32_t Alignment{1}; + + /// Maximum number of bytes to use for alignment of the block. + uint32_t AlignmentMaxBytes{0}; /// Number of times this basic block was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; @@ -499,15 +502,25 @@ public: } /// Set minimum alignment for the basic block. - void setAlignment(uint64_t Align) { + void setAlignment(uint32_t Align) { Alignment = Align; } /// Return required alignment for the block. 
- uint64_t getAlignment() const { + uint32_t getAlignment() const { return Alignment; } + /// Set the maximum number of bytes to use for the block alignment. + void setAlignmentMaxBytes(uint32_t Value) { + AlignmentMaxBytes = Value; + } + + /// Return the maximum number of bytes to use for the block alignment. + uint32_t getAlignmentMaxBytes() const { + return AlignmentMaxBytes; + } + /// Adds block to successor list, and also updates predecessor list for /// successor block. /// Set branch info for this path. diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index a127b8e3aea2..dd8ef1eae0c6 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -55,9 +55,10 @@ extern bool shouldProcess(const BinaryFunction &); extern cl::opt UpdateDebugSections; extern cl::opt Verbosity; -static cl::opt +cl::opt AlignBlocks("align-blocks", - cl::desc("try to align BBs inserting nops"), + cl::desc("align basic blocks"), + cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -74,6 +75,13 @@ AlignMacroOpFusion("align-macro-fusion", cl::ZeroOrMore, cl::cat(BoltRelocCategory)); +cl::opt +PreserveBlocksAlignment("preserve-blocks-alignment", + cl::desc("try to preserve basic block alignment"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt DotToolTipCode("dot-tooltip-code", cl::desc("add basic block instructions as tool tips on nodes"), @@ -1603,7 +1611,7 @@ bool BinaryFunction::buildCFG() { // Always create new BB at branch destination. 
PrevBB = InsertBB; InsertBB = addBasicBlock(LI->first, LI->second, - /* DeriveAlignment = */ IsLastInstrNop); + opts::PreserveBlocksAlignment && IsLastInstrNop); if (hasEntryPointAtOffset(Offset)) InsertBB->setEntryPoint(); if (PrevBB) @@ -1631,7 +1639,8 @@ bool BinaryFunction::buildCFG() { } else { InsertBB = addBasicBlock(Offset, BC.Ctx->createTempSymbol("FT", true), - /* DeriveAlignment = */ IsLastInstrNop); + opts::PreserveBlocksAlignment && + IsLastInstrNop); updateOffset(LastInstrOffset); } } @@ -2195,8 +2204,11 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { if (EmitColdPart != BB->isCold()) continue; - if (opts::AlignBlocks && BB->getAlignment() > 1) - Streamer.EmitCodeAlignment(BB->getAlignment()); + if ((opts::AlignBlocks || opts::PreserveBlocksAlignment) + && BB->getAlignment() > 1) { + Streamer.EmitCodeAlignment(BB->getAlignment(), + BB->getAlignmentMaxBytes()); + } Streamer.EmitLabel(BB->getLabel()); // Check if special alignment for macro-fusion is needed. diff --git a/bolt/src/Passes/Aligner.cpp b/bolt/src/Passes/Aligner.cpp index 0d56e9ffcfea..c4d4434f5b30 100644 --- a/bolt/src/Passes/Aligner.cpp +++ b/bolt/src/Passes/Aligner.cpp @@ -11,16 +11,34 @@ #include "Aligner.h" +#define DEBUG_TYPE "bolt-aligner" + using namespace llvm; namespace opts { + extern cl::OptionCategory BoltOptCategory; -cl::opt -UseCompactAligner("use-compact-aligner", - cl::desc("Use compact approach for aligning functions"), - cl::init(false), +extern cl::opt AlignBlocks; +extern cl::opt PreserveBlocksAlignment; + +cl::opt +AlignBlocksMinSize("align-blocks-min-size", + cl::desc("minimal size of the basic block that should be aligned"), + cl::init(0), cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +cl::opt +AlignBlocksThreshold("align-blocks-threshold", + cl::desc("align only blocks with frequency larger than containing function " + "execution frequency specified in percent. E.g. 
1000 means aligning " + "blocks that are 10 times more frequently executed than the " + "containing function."), + cl::init(800), + cl::ZeroOrMore, + cl::Hidden, cl::cat(BoltOptCategory)); cl::opt @@ -37,6 +55,20 @@ AlignFunctionsMaxBytes("align-functions-max-bytes", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +cl::opt +BlockAlignment("block-alignment", + cl::desc("boundary to use for alignment of basic blocks"), + cl::init(16), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +UseCompactAligner("use-compact-aligner", + cl::desc("Use compact approach for aligning functions"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + } // end namespace opts namespace llvm { @@ -56,9 +88,11 @@ void alignMaxBytes(BinaryFunction &Function) { // the fuction by not more than the minimum over // -- the size of the function // -- the specified number of bytes -void alignCompact(BinaryContext &BC, BinaryFunction &Function) { +void alignCompact(BinaryFunction &Function) { + const auto &BC = Function.getBinaryContext(); size_t HotSize = 0; size_t ColdSize = 0; + for (const auto *BB : Function.layout()) { if (BB->isCold()) ColdSize += BC.computeCodeSize(BB->begin(), BB->end()); @@ -80,19 +114,74 @@ void alignCompact(BinaryContext &BC, BinaryFunction &Function) { } // end anonymous namespace +void AlignerPass::alignBlocks(BinaryFunction &Function) { + if (!Function.hasValidProfile() || !Function.isSimple()) + return; + + const auto &BC = Function.getBinaryContext(); + + const auto FuncCount = std::max(1UL, Function.getKnownExecutionCount()); + BinaryBasicBlock *PrevBB{nullptr}; + for (auto *BB : Function.layout()) { + auto Count = BB->getKnownExecutionCount(); + + if (Count <= FuncCount * opts::AlignBlocksThreshold / 100) { + PrevBB = BB; + continue; + } + + uint64_t FTCount = 0; + if (PrevBB && PrevBB->getFallthrough() == BB) { + FTCount = PrevBB->getBranchInfo(*BB).Count; + } + PrevBB = BB; + + if (Count < FTCount * 2) + continue; + + const auto BlockSize = 
BC.computeCodeSize(BB->begin(), BB->end()); + const auto BytesToUse = std::min(opts::BlockAlignment - 1UL, BlockSize); + + if (opts::AlignBlocksMinSize && BlockSize < opts::AlignBlocksMinSize) + continue; + + BB->setAlignment(opts::BlockAlignment); + BB->setAlignmentMaxBytes(BytesToUse); + + // Update stats. + AlignHistogram[BytesToUse]++; + AlignedBlocksCount += BB->getKnownExecutionCount(); + } +} + void AlignerPass::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { if (!BC.HasRelocations) return; + AlignHistogram.resize(opts::BlockAlignment); + for (auto &It : BFs) { auto &Function = It.second; + if (opts::UseCompactAligner) - alignCompact(BC, Function); + alignCompact(Function); else alignMaxBytes(Function); + + if (opts::AlignBlocks && !opts::PreserveBlocksAlignment) + alignBlocks(Function); } + + DEBUG( + dbgs() << "BOLT-DEBUG: max bytes per basic block alignment distribution:\n"; + for (unsigned I = 1; I < AlignHistogram.size(); ++I) { + dbgs() << " " << I << " : " << AlignHistogram[I] << '\n'; + } + dbgs() << "BOLT-DEBUG: total execution count of aligned blocks: " + << AlignedBlocksCount << '\n'; + ); } } // end namespace bolt diff --git a/bolt/src/Passes/Aligner.h b/bolt/src/Passes/Aligner.h index 3164a47a91c8..28e6f6d693b0 100644 --- a/bolt/src/Passes/Aligner.h +++ b/bolt/src/Passes/Aligner.h @@ -18,7 +18,18 @@ namespace llvm { namespace bolt { class AlignerPass : public BinaryFunctionPass { - public: +private: + + /// Stats for usage of max bytes for basic block alignment. + std::vector AlignHistogram; + + /// Stats: execution count of blocks that were aligned. + uint64_t AlignedBlocksCount{0}; + + /// Assign alignment to basic blocks based on profile. + void alignBlocks(BinaryFunction &Function); + +public: explicit AlignerPass() : BinaryFunctionPass(false) {} const char *getName() const override {