diff --git a/bolt/CacheMetrics.cpp b/bolt/CacheMetrics.cpp
index bd723b80629d..638872dc67c2 100644
--- a/bolt/CacheMetrics.cpp
+++ b/bolt/CacheMetrics.cpp
@@ -8,26 +8,65 @@
 //===----------------------------------------------------------------------===//
 
 #include "CacheMetrics.h"
+#include "llvm/Support/Options.h"
 
 using namespace llvm;
 using namespace bolt;
 
-using Traversal = std::vector<BinaryBasicBlock *>;
-// The weight of fallthrough jumps for ExtTSP metric
-constexpr double FallthroughWeight = 1.0;
-// The weight of forward jumps for ExtTSP metric
-constexpr double ForwardWeight = 1.0;
-// The weight of backward jumps for ExtTSP metric
-constexpr double BackwardWeight = 1.0;
-// The maximum distance (in bytes) of forward jumps for ExtTSP metric
-constexpr uint64_t ForwardDistance = 256;
-// The maximum distance (in bytes) of backward jumps for ExtTSP metric
-constexpr uint64_t BackwardDistance = 256;
+namespace opts {
 
-// The size of the i-TLB cache page
-constexpr uint64_t ITLBPageSize = 4096;
-// Capacity of the i-TLB cache
-constexpr uint64_t ITLBEntries = 16;
+extern cl::OptionCategory BoltOptCategory;
+
+cl::opt<double>
+FallthroughWeight("fallthrough-weight",
+  cl::desc("The weight of fallthrough jumps for ExtTSP metric"),
+  cl::init(1),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<double>
+ForwardWeight("forward-weight",
+  cl::desc("The weight of forward jumps for ExtTSP metric"),
+  cl::init(0.4),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<double>
+BackwardWeight("backward-weight",
+  cl::desc("The weight of backward jumps for ExtTSP metric"),
+  cl::init(0.4),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+ForwardDistance("forward-distance",
+  cl::desc("The maximum distance (in bytes) of forward jumps for ExtTSP metric"),
+  cl::init(768),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+BackwardDistance("backward-distance",
+  cl::desc("The maximum distance (in bytes) of backward jumps for ExtTSP metric"),
+  cl::init(192),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+ITLBPageSize("itlb-page-size",
+  cl::desc("The size of i-tlb cache page"),
+  cl::init(4096),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<unsigned>
+ITLBEntries("itlb-entries",
+  cl::desc("The number of entries in i-tlb cache"),
+  cl::init(16),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+}
 
 namespace {
 
@@ -46,104 +85,6 @@ void extractBasicBlockInfo(
   }
 }
 
-/// Initialize and return a vector of traversals for a given entry block
-std::vector<Traversal> getTraversals(BinaryBasicBlock *EntryBB) {
-  std::vector<Traversal> AllTraversals;
-  std::stack<std::pair<BinaryBasicBlock *, Traversal>> Stack;
-  Stack.push(std::make_pair(EntryBB, Traversal()));
-  std::unordered_set<BinaryBasicBlock *> BBSet;
-
-  while (!Stack.empty()) {
-    BinaryBasicBlock *CurrentBB = Stack.top().first;
-    Traversal PrevTraversal(Stack.top().second);
-    Stack.pop();
-
-    // Add current basic block into consideration
-    BBSet.insert(CurrentBB);
-    PrevTraversal.push_back(CurrentBB);
-
-    if (CurrentBB->succ_empty()) {
-      AllTraversals.push_back(PrevTraversal);
-      continue;
-    }
-
-    bool HaveSuccCount = false;
-    // Calculate total edges count of successors
-    for (auto BI = CurrentBB->branch_info_begin();
-         BI != CurrentBB->branch_info_end(); ++BI) {
-      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
-        HaveSuccCount = true;
-        break;
-      }
-    }
-    if (!HaveSuccCount) {
-      AllTraversals.push_back(PrevTraversal);
-      continue;
-    }
-
-    auto BI = CurrentBB->branch_info_begin();
-    for (auto *SuccBB : CurrentBB->successors()) {
-      // If we have never seen SuccBB, or SuccBB indicates the
-      // end of traversal, SuccBB will be added into stack for
-      // further exploring.
-      if ((BBSet.find(SuccBB) == BBSet.end() && BI->Count != 0 &&
-           BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) ||
-          SuccBB->succ_empty()) {
-        Stack.push(std::make_pair(SuccBB, PrevTraversal));
-      }
-      ++BI;
-    }
-  }
-
-  return AllTraversals;
-}
-
-/// Given a traversal, return the sum of block distances along this traversal.
-double getTraversalLength(
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const Traversal &Path) {
-  double Length = 0;
-  for (size_t I = 0; I + 1 < Path.size(); I++) {
-    // Ignore calls between hot and cold parts
-    if (Path[I]->isCold() != Path[I + 1]->isCold())
-      continue;
-    double SrcAddr = BBAddr.at(Path[I]);
-    double DstAddr = BBAddr.at(Path[I + 1]);
-    Length += std::abs(SrcAddr - DstAddr);
-  }
-  return Length;
-}
-
-/// Calculate average number of call distance for every graph traversal
-double calcGraphDistance(
-    const std::vector<BinaryFunction *> &BinaryFunctions,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
-
-  double TotalTraversalLength = 0;
-  double NumTraversals = 0;
-  for (auto BF : BinaryFunctions) {
-    // Only consider functions which are known to be executed
-    if (BF->getKnownExecutionCount() == 0)
-      continue;
-
-    for (auto BB : BF->layout()) {
-      if (BB->isEntryPoint()) {
-        auto AllTraversals = getTraversals(BB);
-        for (auto const &Path : AllTraversals) {
-          // Ignore short traversals
-          if (Path.size() <= 1)
-            continue;
-          TotalTraversalLength += getTraversalLength(BBAddr, Path);
-          NumTraversals++;
-        }
-      }
-    }
-  }
-
-  return TotalTraversalLength / NumTraversals;
-}
-
 /// Calculate TSP metric, which quantifies the number of fallthrough jumps in
 /// the ordering of basic blocks
 double calcTSPScore(
@@ -166,22 +107,12 @@ double calcTSPScore(
   return Score;
 }
 
-/// Calculate Extended-TSP metric, which quantifies the expected number of
-/// i-cache misses for a given ordering of basic blocks. The parameters are:
-/// - FallthroughWeight is the impact of fallthrough jumps on the score
-/// - ForwardWeight is the impact of forward (but not fallthrough) jumps
-/// - BackwardWeight is the impact of backward jumps
-/// - ForwardDistance is the max distance of a forward jump affecting the score
-/// - BackwardDistance is the max distance of a backward jump affecting the score
+/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache
+/// misses for a given ordering of basic blocks
 double calcExtTSPScore(
     const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize,
-    double FallthroughWeight,
-    double ForwardWeight,
-    double BackwardWeight,
-    uint64_t ForwardDistance,
-    uint64_t BackwardDistance) {
+    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
   double Score = 0.0;
 
   for (auto BF : BinaryFunctions) {
@@ -189,33 +120,10 @@ double calcExtTSPScore(
       auto BI = SrcBB->branch_info_begin();
       for (auto DstBB : SrcBB->successors()) {
         if (DstBB != SrcBB) {
-          double Count = BI->Count == BinaryBasicBlock::COUNT_NO_PROFILE
-                           ? 0.0
-                           : double(BI->Count);
-          uint64_t SrcAddr = BBAddr.at(SrcBB);
-          uint64_t SrcSize = BBSize.at(SrcBB);
-          uint64_t DstAddr = BBAddr.at(DstBB);
-
-          if (SrcAddr <= DstAddr) {
-            if (SrcAddr + SrcSize == DstAddr) {
-              // fallthrough jump
-              Score += FallthroughWeight * Count;
-            } else {
-              // the distance of the forward jump
-              size_t Dist = DstAddr - (SrcAddr + SrcSize);
-              if (Dist <= ForwardDistance) {
-                double Prob = double(ForwardDistance - Dist) / ForwardDistance;
-                Score += ForwardWeight * Prob * Count;
-              }
-            }
-          } else {
-            // the distance of the backward jump
-            size_t Dist = SrcAddr + SrcSize - DstAddr;
-            if (Dist <= BackwardDistance) {
-              double Prob = double(BackwardDistance - Dist) / BackwardDistance;
-              Score += BackwardWeight * Prob * Count;
-            }
-          }
+          Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB),
+                                             BBSize.at(SrcBB),
+                                             BBAddr.at(DstBB),
+                                             BI->Count);
         }
         ++BI;
       }
@@ -277,10 +185,10 @@ extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
 double expectedCacheHitRatio(
     const std::vector<BinaryFunction *> &BinaryFunctions,
     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
-    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize,
-    double PageSize,
-    uint64_t CacheEntries) {
+    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
+  const double PageSize = opts::ITLBPageSize;
+  const uint64_t CacheEntries = opts::ITLBEntries;
   auto Calls = extractFunctionCalls(BinaryFunctions);
 
   // Compute 'hotness' of the functions
   double TotalSamples = 0;
@@ -334,6 +242,34 @@ double expectedCacheHitRatio(
   return 100.0 * (1.0 - Misses / TotalSamples);
 }
 
+} // end namespace anonymous
+
+double CacheMetrics::extTSPScore(uint64_t SrcAddr,
+                                 uint64_t SrcSize,
+                                 uint64_t DstAddr,
+                                 uint64_t Count) {
+  assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
+
+  // Fallthrough
+  if (SrcAddr + SrcSize == DstAddr) {
+    return opts::FallthroughWeight * Count;
+  }
+  // Forward
+  if (SrcAddr + SrcSize < DstAddr) {
+    const auto Dist = DstAddr - (SrcAddr + SrcSize);
+    if (Dist <= opts::ForwardDistance) {
+      double Prob = 1.0 - static_cast<double>(Dist) / opts::ForwardDistance;
+      return opts::ForwardWeight * Prob * Count;
+    }
+    return 0;
+  }
+  // Backward
+  const auto Dist = SrcAddr + SrcSize - DstAddr;
+  if (Dist <= opts::BackwardDistance) {
+    double Prob = 1.0 - static_cast<double>(Dist) / opts::BackwardDistance;
+    return opts::BackwardWeight * Prob * Count;
+  }
+  return 0;
 }
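For intuition about the formula above, here is a minimal standalone sketch (not part of the patch) that re-states `extTSPScore` with the default option values and evaluates it for three made-up jumps:

```cpp
#include <cstdint>
#include <cstdio>

// Re-statement of extTSPScore with the default option values:
// fallthrough-weight=1, forward-weight=0.4, backward-weight=0.4,
// forward-distance=768, backward-distance=192.
static double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize,
                          uint64_t DstAddr, uint64_t Count) {
  if (SrcAddr + SrcSize == DstAddr) // fallthrough jump
    return 1.0 * Count;
  if (SrcAddr + SrcSize < DstAddr) { // forward jump
    uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
    return Dist <= 768 ? 0.4 * (1.0 - double(Dist) / 768) * Count : 0;
  }
  uint64_t Dist = SrcAddr + SrcSize - DstAddr; // backward jump
  return Dist <= 192 ? 0.4 * (1.0 - double(Dist) / 192) * Count : 0;
}

int main() {
  printf("%.1f\n", extTSPScore(0, 64, 64, 100));  // fallthrough: 100.0
  printf("%.1f\n", extTSPScore(0, 64, 448, 100)); // forward, Dist=384: 20.0
  printf("%.1f\n", extTSPScore(96, 64, 0, 100));  // backward, Dist=160: 6.7
  return 0;
}
```

Fallthroughs are rewarded at full weight, while the credit for forward and backward jumps decays linearly to zero at their respective distance limits.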
 
 void CacheMetrics::printAll(
@@ -356,10 +292,10 @@ void CacheMetrics::printAll(
   }
 
   outs() << format(" There are %zu functions;", NumFunctions)
-         << format(" %zu (%.2lf%%) have non-empty execution count\n",
+         << format(" %zu (%.2lf%%) have positive execution count\n",
                    NumHotFunctions,
                    100.0 * NumHotFunctions / NumFunctions);
   outs() << format(" There are %zu basic blocks;", NumBlocks)
-         << format(" %zu (%.2lf%%) have non-empty execution count\n",
+         << format(" %zu (%.2lf%%) have positive execution count\n",
                    NumHotBlocks,
                    100.0 * NumHotBlocks / NumBlocks);
 
   std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
@@ -377,35 +313,14 @@ void CacheMetrics::printAll(
   outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu)\n",
                    100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize);
 
-  outs() << " An average length of graph traversal: "
-         << format("%.0lf\n", calcGraphDistance(BinaryFunctions,
-                                                BBAddr,
-                                                BBSize));
-
-  outs() << " Expected i-TLB cache hit ratio "
-         << format("(%zu, %zu): ", ITLBPageSize, ITLBEntries)
+  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BinaryFunctions,
                                                       BBAddr,
-                                                      BBSize,
-                                                      ITLBPageSize,
-                                                      ITLBEntries));
+                                                      BBSize));
 
   outs() << " TSP score: "
         << format("%.0lf\n", calcTSPScore(BinaryFunctions, BBAddr, BBSize));
 
-  outs() << " ExtTSP score "
-         << format("(%.2lf, %.2lf, %.2lf, %zu, %zu): ", FallthroughWeight,
-                                                        ForwardWeight,
-                                                        BackwardWeight,
-                                                        ForwardDistance,
-                                                        BackwardDistance)
-         << format("%.0lf\n", calcExtTSPScore(BinaryFunctions,
-                                              BBAddr,
-                                              BBSize,
-                                              FallthroughWeight,
-                                              ForwardWeight,
-                                              BackwardWeight,
-                                              ForwardDistance,
-                                              BackwardDistance));
-
+  outs() << " ExtTSP score: "
+         << format("%.0lf\n", calcExtTSPScore(BinaryFunctions, BBAddr, BBSize));
 }
diff --git a/bolt/CacheMetrics.h b/bolt/CacheMetrics.h
index 1dab4565bc34..b512168ebaf3 100644
--- a/bolt/CacheMetrics.h
+++ b/bolt/CacheMetrics.h
@@ -20,6 +20,17 @@ namespace CacheMetrics {
 /// Calculate various metrics related to instruction cache performance.
 void printAll(const std::vector<BinaryFunction *> &BinaryFunctions);
 
+/// Calculate Extended-TSP metric, which quantifies the expected number of
+/// i-cache misses for a given pair of basic blocks. The parameters are:
+/// - SrcAddr is the address of the source block;
+/// - SrcSize is the size of the source block;
+/// - DstAddr is the address of the destination block;
+/// - Count is the number of jumps between the pair of blocks.
+double extTSPScore(uint64_t SrcAddr,
+                   uint64_t SrcSize,
+                   uint64_t DstAddr,
+                   uint64_t Count);
+
 } // namespace CacheMetrics
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp
index 6f78bb856f2c..b90cc0f10c52 100644
--- a/bolt/Passes/BinaryPasses.cpp
+++ b/bolt/Passes/BinaryPasses.cpp
@@ -161,6 +161,9 @@ ReorderBlocks("reorder-blocks",
                "cache",
                "perform optimal layout prioritizing I-cache "
                "behavior"),
+    clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS,
+               "cache+",
+               "perform layout optimizing I-cache behavior"),
     clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                "cluster-shuffle",
                "perform random layout of clusters"),
@@ -469,6 +472,10 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
     Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
     break;
 
+  case LT_OPTIMIZE_CACHE_PLUS:
+    Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo)));
+    break;
+
   case LT_OPTIMIZE_SHUFFLE:
     Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
     break;
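Once the `clEnumValN` entry and the dispatch case above are in place, the new mode can be requested like any other block-reordering layout. An illustrative llvm-bolt invocation (the binary and profile names are placeholders):

```
llvm-bolt <input-binary> -o <output-binary> -data=<profile>.fdata -reorder-blocks=cache+
```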
diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h
index ea7376f7997b..0ef8e9027d55 100644
--- a/bolt/Passes/BinaryPasses.h
+++ b/bolt/Passes/BinaryPasses.h
@@ -169,6 +169,8 @@ public:
     /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04)
     /// that suggests putting frequently executed chains first in the layout.
     LT_OPTIMIZE_CACHE,
+    /// Block reordering guided by the extended TSP metric.
+    LT_OPTIMIZE_CACHE_PLUS,
     /// Create clusters and use random order for them.
     LT_OPTIMIZE_SHUFFLE,
   };
diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt
index f9b8db8703af..0255e7b40048 100644
--- a/bolt/Passes/CMakeLists.txt
+++ b/bolt/Passes/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMBOLTPasses
   BinaryFunctionCallGraph.cpp
   CallGraph.cpp
   CallGraphWalker.cpp
+  CachePlusReorderAlgorithm.cpp
   DataflowAnalysis.cpp
   DataflowInfoManager.cpp
   FrameAnalysis.cpp
diff --git a/bolt/Passes/CachePlusReorderAlgorithm.cpp b/bolt/Passes/CachePlusReorderAlgorithm.cpp
new file mode 100644
index 000000000000..5a717aaec5e2
--- /dev/null
+++ b/bolt/Passes/CachePlusReorderAlgorithm.cpp
@@ -0,0 +1,476 @@
+//===--- CachePlusReorderAlgorithm.cpp - Order basic blocks ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryBasicBlock.h"
+#include "BinaryFunction.h"
+#include "CacheMetrics.h"
+#include "ReorderAlgorithm.h"
+#include "ReorderUtils.h"
+
+using namespace llvm;
+using namespace bolt;
+using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+
+// A cluster (ordered sequence) of basic blocks
+class Cluster {
+public:
+  Cluster(BinaryBasicBlock *BB, uint64_t ExecutionCount_, uint64_t Size_)
+  : Id(BB->getLayoutIndex()),
+    IsEntry(BB->getLayoutIndex() == 0),
+    ExecutionCount(ExecutionCount_),
+    Size(Size_),
+    Score(0) {
+    Blocks.push_back(BB);
+  }
+
+  size_t id() const {
+    return Id;
+  }
+
+  uint64_t size() const {
+    return Size;
+  }
+
+  double density() const {
+    return static_cast<double>(ExecutionCount) / Size;
+  }
+
+  bool isCold() const {
+    return ExecutionCount == 0;
+  }
+
+  uint64_t executionCount() const {
+    return ExecutionCount;
+  }
+
+  bool isEntryPoint() const {
+    return IsEntry;
+  }
+
+  double score() const {
+    return Score;
+  }
+
+  const std::vector<BinaryBasicBlock *> &blocks() const {
+    return Blocks;
+  }
+
+  /// Update the list of basic blocks and meta-info
+  void merge(const Cluster *Other,
+             const std::vector<BinaryBasicBlock *> &MergedBlocks,
+             double MergedScore) {
+    Blocks = MergedBlocks;
+    IsEntry |= Other->IsEntry;
+    ExecutionCount += Other->ExecutionCount;
+    Size += Other->Size;
+    Score = MergedScore;
+  }
+
+private:
+  std::vector<BinaryBasicBlock *> Blocks;
+  size_t Id;
+  bool IsEntry;
+  uint64_t ExecutionCount;
+  uint64_t Size;
+  double Score;
+};
+
+/// Deterministically compare clusters by their density in decreasing order
+bool compareClusters(const Cluster *C1, const Cluster *C2) {
+  // Put the original entry point at the front
+  if (C1->isEntryPoint())
+    return true;
+  if (C2->isEntryPoint())
+    return false;
+
+  const double D1 = C1->density();
+  const double D2 = C2->density();
+  if (D1 != D2)
+    return D1 > D2;
+  // Making the order deterministic
+  return C1->id() < C2->id();
+}
+
+/// Deterministically compare pairs of clusters
+bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
+                         const Cluster *A2, const Cluster *B2) {
+  const auto Samples1 = A1->executionCount() + B1->executionCount();
+  const auto Samples2 = A2->executionCount() + B2->executionCount();
+  if (Samples1 != Samples2)
+    return Samples1 < Samples2;
+
+  if (A1 != A2)
+    return A1->id() < A2->id();
+  return B1->id() < B2->id();
+}
+
+} // end namespace anonymous
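Both comparators above are total orders: every tie eventually falls back to an immutable cluster id, so `std::stable_sort` produces the same layout on every run. A toy illustration of the resulting determinism (hypothetical stand-in type, not part of the patch):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct ToyCluster {
  size_t Id;
  bool Entry;
  double Density;
};

// Mirrors compareClusters: entry first, then density, then id as tie-break.
static bool compare(const ToyCluster *C1, const ToyCluster *C2) {
  if (C1->Entry) return true;
  if (C2->Entry) return false;
  if (C1->Density != C2->Density) return C1->Density > C2->Density;
  return C1->Id < C2->Id;
}

int main() {
  ToyCluster A{0, true, 0.1}, B{1, false, 5.0}, C{2, false, 5.0};
  std::vector<ToyCluster *> Order = {&C, &B, &A};
  std::stable_sort(Order.begin(), Order.end(), compare);
  for (auto *X : Order)
    printf("%zu ", X->Id); // 0 1 2 -- entry first, equal densities by id
  return 0;
}
```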
+
+/// CachePlus - layout of basic blocks with i-cache optimization.
+///
+/// Similarly to OptimizeCacheReorderAlgorithm, this algorithm is a greedy
+/// heuristic that works with clusters (ordered sequences) of basic blocks.
+/// Initially all clusters are isolated basic blocks. On every iteration,
+/// we pick a pair of clusters whose merging yields the biggest increase in
+/// the ExtTSP metric (see CacheMetrics.cpp for the exact implementation),
+/// which models how i-cache "friendly" a specific cluster is. A pair of
+/// clusters giving the maximum gain is merged into a new cluster. The
+/// procedure stops when there is only one cluster left, or when merging
+/// does not increase ExtTSP. In the latter case, the remaining clusters
+/// are sorted by density.
+///
+/// An important aspect is the way two clusters are merged. Unlike earlier
+/// algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two
+/// clusters, X and Y, are first split into three, X1, X2, and Y. Then we
+/// consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y,
+/// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
+/// This improves the quality of the final result (the search space is larger)
+/// while keeping the implementation sufficiently fast.
+class CachePlus {
+public:
+  CachePlus(const BinaryFunction &BF)
+  : BF(BF), Adjacent(BF.layout_size()), Cache(BF.layout_size()) {
+    initialize();
+  }
+
+  /// Run cache+ algorithm and return a basic block ordering
+  std::vector<BinaryBasicBlock *> run() {
+    // Merge pairs of clusters while there is an improvement in ExtTSP metric
+    while (Clusters.size() > 1) {
+      Cluster *BestClusterPred = nullptr;
+      Cluster *BestClusterSucc = nullptr;
+      std::pair<double, size_t> BestGain(-1, 0);
+      for (auto ClusterPred : Clusters) {
+        // Get candidates for merging with the current cluster
+        Adjacent.forAllAdjacent(
+          ClusterPred,
+          // Find the best candidate
+          [&](Cluster *ClusterSucc) {
+            assert(ClusterPred != ClusterSucc && "loop edges are not supported");
+            // Do not merge cold blocks
+            if (ClusterPred->isCold() || ClusterSucc->isCold())
+              return;
+
+            // Compute the gain of merging two clusters
+            auto Gain = mergeGain(ClusterPred, ClusterSucc);
+            if (Gain.first <= 0.0)
+              return;
+
+            // Break ties by density so that the hottest clusters are merged first
+            if (Gain.first > BestGain.first ||
+                (std::abs(Gain.first - BestGain.first) < 1e-8 &&
+                 compareClusterPairs(ClusterPred,
+                                     ClusterSucc,
+                                     BestClusterPred,
+                                     BestClusterSucc))) {
+              BestGain = Gain;
+              BestClusterPred = ClusterPred;
+              BestClusterSucc = ClusterSucc;
+            }
+          });
+      }
+
+      // Stop merging when there is no improvement
+      if (BestGain.first <= 0.0)
+        break;
+
+      // Merge the best pair of clusters
+      mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
+    }
+
+    // Sort clusters by density
+    std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
+
+    // Collect the basic blocks in the order specified by their clusters
+    std::vector<BinaryBasicBlock *> Result;
+    Result.reserve(BF.layout_size());
+    for (auto Cluster : Clusters) {
+      Result.insert(Result.end(),
+                    Cluster->blocks().begin(),
+                    Cluster->blocks().end());
+    }
+
+    return Result;
+  }
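A note on the 1e-8 tolerance in the loop above: merge gains are recomputed floating-point sums, so two logically equal gains may differ by rounding noise, and treating near-equal gains as ties hands the decision to the deterministic comparator. A contrived illustration (not part of the patch):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Two merge gains that differ only by rounding noise.
  double G1 = 0.1 + 0.2; // 0.30000000000000004
  double G2 = 0.3;
  printf("equal: %d\n", G1 == G2);                  // 0
  printf("tie:   %d\n", std::fabs(G1 - G2) < 1e-8); // 1
  return 0;
}
```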
+
+private:
+  /// Initialize the set of active clusters, edges between blocks, and the
+  /// adjacency matrix.
+  void initialize() {
+    // Initialize indices of basic blocks
+    size_t LayoutIndex = 0;
+    for (auto BB : BF.layout()) {
+      BB->setLayoutIndex(LayoutIndex);
+      LayoutIndex++;
+    }
+
+    // Initialize edges for the blocks and compute their total in/out weights
+    OutEdges = std::vector<EdgeList>(BF.layout_size());
+    auto InWeight = std::vector<uint64_t>(BF.layout_size(), 0);
+    auto OutWeight = std::vector<uint64_t>(BF.layout_size(), 0);
+    for (auto BB : BF.layout()) {
+      auto BI = BB->branch_info_begin();
+      for (auto I : BB->successors()) {
+        assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+               "missing profile for a jump");
+        if (I != BB && BI->Count > 0) {
+          InWeight[I->getLayoutIndex()] += BI->Count;
+          OutEdges[BB->getLayoutIndex()].push_back(std::make_pair(I, BI->Count));
+          OutWeight[BB->getLayoutIndex()] += BI->Count;
+        }
+        ++BI;
+      }
+    }
+
+    // Initialize the execution count of every basic block: the maximum of the
+    // block's profile count and its total in- and out-edge weights. The
+    // execution count of the entry point is set to at least 1.
+    auto ExecutionCounts = std::vector<uint64_t>(BF.layout_size(), 0);
+    for (auto BB : BF.layout()) {
+      uint64_t EC = BB->getKnownExecutionCount();
+      EC = std::max(EC, InWeight[BB->getLayoutIndex()]);
+      EC = std::max(EC, OutWeight[BB->getLayoutIndex()]);
+      if (BB->getLayoutIndex() == 0)
+        EC = std::max(EC, uint64_t(1));
+      ExecutionCounts[BB->getLayoutIndex()] = EC;
+    }
+
+    // Initialize clusters
+    Clusters.reserve(BF.layout_size());
+    AllClusters.reserve(BF.layout_size());
+    Size.reserve(BF.layout_size());
+    for (auto BB : BF.layout()) {
+      size_t Index = BB->getLayoutIndex();
+      Size.push_back(std::max(BB->estimateSize(), size_t(1)));
+      AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
+      Clusters.push_back(&AllClusters[Index]);
+    }
+
+    // Initialize the adjacency matrix
+    Adjacent.initialize(Clusters);
+    for (auto BB : BF.layout()) {
+      for (auto I : BB->successors()) {
+        if (BB != I)
+          Adjacent.set(Clusters[BB->getLayoutIndex()],
+                       Clusters[I->getLayoutIndex()]);
+      }
+    }
+  }
+
+  /// Compute the ExtTSP score for a given order of basic blocks
+  double score(const std::vector<BinaryBasicBlock *> &Blocks) const {
+    uint64_t NotSet = static_cast<uint64_t>(-1);
+    auto Addr = std::vector<uint64_t>(BF.layout_size(), NotSet);
+    uint64_t CurAddr = 0;
+    for (auto BB : Blocks) {
+      size_t Index = BB->getLayoutIndex();
+      Addr[Index] = CurAddr;
+      CurAddr += Size[Index];
+    }
+
+    double Score = 0;
+    for (auto BB : Blocks) {
+      size_t Index = BB->getLayoutIndex();
+      for (auto Edge : OutEdges[Index]) {
+        auto SuccBB = Edge.first;
+        size_t SuccIndex = SuccBB->getLayoutIndex();
+
+        if (Addr[SuccIndex] != NotSet) {
+          Score += CacheMetrics::extTSPScore(Addr[Index],
+                                             Size[Index],
+                                             Addr[SuccIndex],
+                                             Edge.second);
+        }
+      }
+    }
+    return Score;
+  }
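To make the scoring above concrete: score() assigns each block a cumulative byte offset derived from the estimated sizes and then evaluates every recorded out-edge at those offsets. A small sketch with made-up sizes (not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Blocks are laid out back to back at cumulative offsets before scoring.
int main() {
  std::vector<uint64_t> Size = {16, 32, 8};
  std::vector<uint64_t> Addr(Size.size());
  uint64_t Cur = 0;
  for (size_t I = 0; I < Size.size(); I++) {
    Addr[I] = Cur;
    Cur += Size[I];
  }
  // Block 0 jumps to block 2: the source ends at byte 16 and the
  // destination starts at byte 48, so this is a forward jump of 32 bytes.
  uint64_t Dist = Addr[2] - (Addr[0] + Size[0]);
  printf("forward distance = %llu\n", (unsigned long long)Dist); // 32
  return 0;
}
```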
+
+  /// The gain of merging two clusters.
+  ///
+  /// The function considers all possible ways of merging two clusters and
+  /// chooses the one with the largest increase in the ExtTSP metric. The
+  /// result is a pair whose first element is the gain and whose second element
+  /// is the corresponding merge type (encoded as an integer).
+  std::pair<double, size_t> mergeGain(const Cluster *ClusterPred,
+                                      const Cluster *ClusterSucc) const {
+    if (Cache.contains(ClusterPred, ClusterSucc)) {
+      return Cache.get(ClusterPred, ClusterSucc);
+    }
+
+    // The current score of two separate clusters
+    const auto CurScore = ClusterPred->score() + ClusterSucc->score();
+
+    // Merge two clusters and update the best Gain
+    auto computeMergeGain = [&](const std::pair<double, size_t> &CurGain,
+                                const Cluster *ClusterPred,
+                                const Cluster *ClusterSucc,
+                                size_t MergeType) {
+      auto MergedBlocks = mergeBlocks(ClusterPred->blocks(),
+                                      ClusterSucc->blocks(),
+                                      MergeType);
+      // Does the new cluster preserve the original entry point?
+      if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
+          MergedBlocks[0]->getLayoutIndex() != 0)
+        return CurGain;
+
+      // The score of the new cluster
+      const auto NewScore = score(MergedBlocks);
+      if (NewScore > CurScore && NewScore - CurScore > CurGain.first)
+        return std::make_pair(NewScore - CurScore, MergeType);
+      else
+        return CurGain;
+    };
+
+    std::pair<double, size_t> Gain = std::make_pair(-1, 0);
+    // Try to simply concatenate two clusters
+    Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
+    // Try to split ClusterPred into two and merge with ClusterSucc
+    for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
+      for (size_t Type = 0; Type < 4; Type++) {
+        size_t MergeType = 1 + Type + Offset * 4;
+        Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
+      }
+    }
+
+    Cache.set(ClusterPred, ClusterSucc, Gain);
+    return Gain;
+  }
+
+  /// Merge two clusters (orders) of blocks according to a given 'merge type'.
+  ///
+  /// If MergeType == 0, then the result is a concatenation of the two clusters.
+  /// Otherwise, the first cluster is cut into two and we consider all possible
+  /// ways of concatenating the three clusters.
+  std::vector<BinaryBasicBlock *> mergeBlocks(
+    const std::vector<BinaryBasicBlock *> &X,
+    const std::vector<BinaryBasicBlock *> &Y,
+    size_t MergeType
+  ) const {
+    // Concatenate three clusters of blocks in the given order
+    auto concat = [&](const std::vector<BinaryBasicBlock *> &A,
+                      const std::vector<BinaryBasicBlock *> &B,
+                      const std::vector<BinaryBasicBlock *> &C) {
+      std::vector<BinaryBasicBlock *> Result;
+      Result.reserve(A.size() + B.size() + C.size());
+      Result.insert(Result.end(), A.begin(), A.end());
+      Result.insert(Result.end(), B.begin(), B.end());
+      Result.insert(Result.end(), C.begin(), C.end());
+      return Result;
+    };
+
+    // Merging w/o splitting existing clusters
+    if (MergeType == 0) {
+      return concat(X, Y, std::vector<BinaryBasicBlock *>());
+    }
+
+    MergeType--;
+    size_t Type = MergeType % 4;
+    size_t Offset = MergeType / 4;
+    assert(0 < Offset && Offset < X.size() &&
+           "Invalid offset while merging clusters");
+    // Split the first cluster, X, into X1 and X2
+    std::vector<BinaryBasicBlock *> X1(X.begin(), X.begin() + Offset);
+    std::vector<BinaryBasicBlock *> X2(X.begin() + Offset, X.end());
+
+    // Construct a new cluster from the three existing ones
+    switch (Type) {
+    case 0: return concat(X1, Y, X2);
+    case 1: return concat(Y, X2, X1);
+    case 2: return concat(X2, Y, X1);
+    case 3: return concat(X2, X1, Y);
+    default:
+      llvm_unreachable("unexpected merge type");
+    }
+  }
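The MergeType integer used by mergeGain and mergeBlocks packs the split point and the arrangement into a single value: 0 means plain concatenation XY, and any other value decodes as 1 + Type + Offset * 4. A tiny decoder illustrating the encoding (not part of the patch):

```cpp
#include <cstdio>

// Decode the MergeType encoding used by mergeGain/mergeBlocks:
// 0              -> X Y (no split)
// 1 + Type+Off*4 -> split X at Off, then arrange per Type:
//                   0: X1 Y X2, 1: Y X2 X1, 2: X2 Y X1, 3: X2 X1 Y
int main() {
  const char *Arrangement[] = {"X1 Y X2", "Y X2 X1", "X2 Y X1", "X2 X1 Y"};
  for (size_t MergeType : {0u, 5u, 14u}) {
    if (MergeType == 0) {
      printf("%zu -> X Y\n", MergeType);
      continue;
    }
    size_t M = MergeType - 1;
    printf("%zu -> split at %zu, %s\n", MergeType, M / 4, Arrangement[M % 4]);
  }
  return 0;
}
```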
+
+  /// Merge cluster From into cluster Into, and update the list of active
+  /// clusters, the adjacency information, and the corresponding cache.
+  void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) {
+    assert(Into != From && "Cluster cannot be merged with itself");
+    // Merge the clusters
+    auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
+    Into->merge(From, MergedBlocks, score(MergedBlocks));
+
+    // Remove cluster From from the list of active clusters
+    auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
+    Clusters.erase(Iter, Clusters.end());
+
+    // Invalidate caches
+    Cache.invalidate(Into);
+
+    // Update the adjacency matrix
+    Adjacent.merge(Into, From);
+  }
+
+  // The binary function
+  const BinaryFunction &BF;
+
+  // All clusters
+  std::vector<Cluster> AllClusters;
+
+  // Active clusters. The vector gets updated at runtime when clusters are merged
+  std::vector<Cluster *> Clusters;
+
+  // Size of the block
+  std::vector<uint64_t> Size;
+
+  // Outgoing edges of the block
+  std::vector<EdgeList> OutEdges;
+
+  // Cluster adjacency matrix
+  AdjacencyMatrix<Cluster> Adjacent;
+
+  // A cache that keeps precomputed values of mergeGain for pairs of clusters;
+  // when a pair of clusters (x,y) gets merged, we invalidate the pairs
+  // containing both x and y and all clusters adjacent to x and y (and recompute
+  // them on the next iteration).
+  mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;
+};
+
+void CachePlusReorderAlgorithm::reorderBasicBlocks(
+  const BinaryFunction &BF, BasicBlockOrder &Order) const {
+  if (BF.layout_empty())
+    return;
+
+  // Are there jumps with positive execution count?
+  uint64_t SumCount = 0;
+  for (auto BB : BF.layout()) {
+    auto BI = BB->branch_info_begin();
+    for (auto I : BB->successors()) {
+      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && I != nullptr);
+      SumCount += BI->Count;
+      ++BI;
+    }
+  }
+
+  // Do not change the layout of functions w/o profile information
+  if (SumCount == 0) {
+    for (auto BB : BF.layout()) {
+      Order.push_back(BB);
+    }
+    return;
+  }
+
+  // Apply the algorithm
+  Order = CachePlus(BF).run();
+
+  // Verify correctness
+  assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
+  assert(Order.size() == BF.layout_size() && "Wrong size of reordered layout");
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp
index fb8f2cbcf2c2..4d15572110f4 100644
--- a/bolt/Passes/HFSortPlus.cpp
+++ b/bolt/Passes/HFSortPlus.cpp
@@ -29,11 +29,8 @@
 #include "BinaryFunction.h"
 #include "HFSort.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
+#include "ReorderUtils.h"
 #include "llvm/Support/Options.h"
-#include "llvm/Support/raw_ostream.h"
 
 #include <unordered_map>
 #include <vector>
@@ -48,21 +45,9 @@ using namespace bolt;
 namespace opts {
 
 extern cl::OptionCategory BoltOptCategory;
-extern cl::opt<unsigned> Verbosity;
 
-cl::opt<unsigned>
-ITLBPageSizeParam("itlb-page-size",
-  cl::desc("The size of i-tlb cache page"),
-  cl::init(4096),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
-
-cl::opt<unsigned>
-ITLBEntriesParam("itlb-entries",
-  cl::desc("The number of entries in i-tlb cache"),
-  cl::init(16),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
+extern cl::opt<unsigned> ITLBPageSize;
+extern cl::opt<unsigned> ITLBEntries;
 
 cl::opt<double>
 MergeProbability("merge-probability",
@@ -92,189 +77,46 @@ int32_t ITLBPageSize;
 // while smaller values result in better i-cache performance
 int32_t ITLBEntries;
 
-// This class maintains adjacency information for all Clusters being
-// processed. It is used to invalidate cache entries when merging
-// Clusters and for visiting all neighbors of any given Cluster.
-class AdjacencyMatrix {
- public:
-  AdjacencyMatrix(const CallGraph &Cg,
-                  std::vector<Cluster *> &Clusters,
-                  const std::vector<Cluster *> &FuncCluster)
-    : Clusters(Clusters),
-      Bits(Cg.numNodes(), BitVector(Cg.numNodes(), false)) {
-    initialize(Cg, FuncCluster);
-  }
-
-  template <typename F>
-  void forallAdjacent(const Cluster *C, F Func) const {
-    const_cast<AdjacencyMatrix *>(this)->forallAdjacent(C, Func);
-  }
-
-  template <typename F>
-  void forallAdjacent(const Cluster *C, F Func) {
-    for (auto I = Bits[C->id()].find_first(); I != -1; I = Bits[C->id()].find_next(I)) {
-      Func(Clusters[I]);
-    }
-  }
-
-  // Merge adjacency info from cluster B into cluster A. Info for cluster B is left
-  // in an undefined state.
-  void merge(const Cluster *A, const Cluster *B) {
-    Bits[A->id()] |= Bits[B->id()];
-    Bits[A->id()][A->id()] = false;
-    Bits[A->id()][B->id()] = false;
-    Bits[B->id()][A->id()] = false;
-    for (auto I = Bits[B->id()].find_first(); I != -1; I = Bits[B->id()].find_next(I)) {
-      Bits[I][A->id()] = true;
-      Bits[I][B->id()] = false;
-    }
-  }
-
-  void dump(const Cluster *A) const {
-    outs() << "Cluster " << A->id() << ":";
-    forallAdjacent(A, [](const Cluster *B) { outs() << " " << B->id(); });
-  }
-
-  void dump() const {
-    for (auto *A : Clusters) {
-      if (!A) continue;
-      dump(A);
-      outs() << "\n";
-    }
-  }
- private:
-  void set(const Cluster *A, const Cluster *B, bool Value) {
-    assert(A != B);
-    Bits[A->id()][B->id()] = Value;
-    Bits[B->id()][A->id()] = Value;
-  }
-
-  void initialize(const CallGraph &Cg, const std::vector<Cluster *> &FuncCluster) {
-    for (auto *A : Clusters) {
-      for (auto TargetId : A->targets()) {
-        for (auto Succ : Cg.successors(TargetId)) {
-          auto *B = FuncCluster[Succ];
-          if (!B || B == A) continue;
-          const auto &Arc = *Cg.findArc(TargetId, Succ);
-          if (Arc.weight() <= 0.0) continue;
-
-          set(A, B, true);
-        }
-        for (auto Pred : Cg.predecessors(TargetId)) {
-          auto *B = FuncCluster[Pred];
-          if (!B || B == A) continue;
-          const auto &Arc = *Cg.findArc(Pred, TargetId);
-          if (Arc.weight() <= 0.0) continue;
-
-          set(A, B, true);
-        }
-      }
-    }
-  }
-
-  std::vector<Cluster *> Clusters;
-  std::vector<BitVector> Bits;
-};
-
-// A cache of precomputed results for a pair of clusters
-class PrecomputedResults {
- public:
-  explicit PrecomputedResults(size_t Size)
-    : Size(Size),
-      Cache(new double[Size*Size]),
-      Valid(Size * Size, false) {
-    memset(Cache, 0, sizeof(double)*Size*Size);
-  }
-  ~PrecomputedResults() {
-    delete[] Cache;
-  }
-
-  bool contains(const Cluster *First, const Cluster *Second) const {
-    return Valid[index(First, Second)];
-  }
-
-  double get(const Cluster *First, const Cluster *Second) const {
-    assert(contains(First, Second));
-    return Cache[index(First, Second)];
-  }
-
-  void set(const Cluster *First, const Cluster *Second, double Value) {
-    const auto Index = index(First, Second);
-    Cache[Index] = Value;
-    Valid[Index] = true;
-  }
-
-  void invalidate(const Cluster *C) {
-    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
-    for (size_t Id = 0; Id < Size; Id++) {
-      Valid.reset(Id * Size + C->id());
-    }
-  }
-
- private:
-  size_t index(const Cluster *First, const Cluster *Second) const {
-    return First->id() * Size + Second->id();
-  }
-
-  size_t Size;
-  double *Cache;
-  BitVector Valid;
-};
-
-/*
- * Erase an element from a container if it is present. Otherwise, do nothing.
- */
-template <typename C, typename V>
-void maybeErase(C &Container, const V& Value) {
-  auto Itr = Container.find(Value);
-  if (Itr != Container.end())
-    Container.erase(Itr);
-}
-
-/*
- * Density of a cluster formed by merging a given pair of clusters
- */
+/// Density of a cluster formed by merging a given pair of clusters.
 double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) {
   const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples();
   const double CombinedSize = ClusterPred->size() + ClusterSucc->size();
   return CombinedSamples / CombinedSize;
 }
 
-/*
- * Deterministically compare clusters by their density in decreasing order.
- */
+/// Deterministically compare clusters by density in decreasing order.
 bool compareClusters(const Cluster *C1, const Cluster *C2) {
   const double D1 = C1->density();
   const double D2 = C2->density();
-  if (D1 != D2) return D1 > D2;
+  if (D1 != D2)
+    return D1 > D2;
   // making sure the sorting is deterministic
-  if (C1->size() != C2->size()) return C1->size() < C2->size();
-  if (C1->samples() != C2->samples()) return C1->samples() > C2->samples();
+  if (C1->size() != C2->size())
+    return C1->size() < C2->size();
+  if (C1->samples() != C2->samples())
+    return C1->samples() > C2->samples();
   return C1->target(0) < C2->target(0);
 }
 
-/*
- * Deterministically compare pairs of clusters by their density
- * in decreasing order.
- */
+/// Deterministically compare pairs of clusters by density in decreasing order.
 bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
                          const Cluster *A2, const Cluster *B2) {
   const auto D1 = density(A1, B1);
   const auto D2 = density(A2, B2);
-  if (D1 != D2) return D1 > D2;
-  // making sure the sorting is deterministic
+  if (D1 != D2)
+    return D1 > D2;
   const auto Size1 = A1->size() + B1->size();
   const auto Size2 = A2->size() + B2->size();
-  if (Size1 != Size2) return Size1 < Size2;
+  if (Size1 != Size2)
+    return Size1 < Size2;
   const auto Samples1 = A1->samples() + B1->samples();
   const auto Samples2 = A2->samples() + B2->samples();
-  if (Samples1 != Samples2) return Samples1 > Samples2;
+  if (Samples1 != Samples2)
+    return Samples1 > Samples2;
   return A1->target(0) < A2->target(0);
 }
 
-/*
- * Sorting clusters by their density in decreasing order
- */
+/// Sorting clusters by their density in decreasing order.
 template <typename C>
 std::vector<Cluster *> sortByDensity(const C &Clusters_) {
   std::vector<Cluster *> Clusters(Clusters_.begin(), Clusters_.end());
@@ -282,27 +124,23 @@ std::vector<Cluster *> sortByDensity(const C &Clusters_) {
   return Clusters;
 }
 
-/*
- * HFSortPlus - layout of hot functions with iTLB cache optimization
- *
- * Given an ordering of hot functions (and hence, their assignment to the
- * iTLB pages), we can divide all functions calls into two categories:
- * - 'short' ones that have a caller-callee distance less than a page;
- * - 'long' ones where the distance exceeds a page.
- * The short calls are likely to result in a iTLB cache hit. For the long ones,
- * the hit/miss result depends on the 'hotness' of the page (i.e., how often
- * the page is accessed). Assuming that functions are sent to the iTLB cache
- * in a random order, the probability that a page is present in the cache is
- * proportional to the number of samples corresponding to the functions on the
- * page. The following algorithm detects short and long calls, and optimizes
- * the expected number of cache misses for the long ones.
- */
+/// HFSortPlus - layout of hot functions with iTLB cache optimization
+///
+/// Given an ordering of hot functions (and hence, their assignment to the
+/// iTLB pages), we can divide all function calls into two categories:
+/// - 'short' ones that have a caller-callee distance less than a page;
+/// - 'long' ones where the distance exceeds a page.
+/// The short calls are likely to result in an iTLB cache hit. For the long
+/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
+/// often the page is accessed). Assuming that functions are sent to the iTLB
+/// cache in a random order, the probability that a page is present in the
+/// cache is proportional to the number of samples corresponding to the
+/// functions on the page. The following algorithm detects short and long
+/// calls, and optimizes the expected number of cache misses for the long ones.
 class HFSortPlus {
 public:
-  /*
-   * The expected number of calls on different i-TLB pages for an arc of the
-   * call graph with a specified weight
-   */
+  /// The expected number of calls on different i-TLB pages for an arc of the
+  /// call graph with a specified weight
   double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double Weight) const {
     const auto Dist = std::abs(SrcAddr - DstAddr);
     if (Dist > ITLBPageSize)
@@ -313,15 +151,13 @@ public:
     return (1.0 - X * X) * Weight;
   }
 
-  /*
-   * The probability that a page with a given weight is not present in the cache
-   *
-   * Assume that the hot functions are called in a random order; then the
-   * probability of a i-TLB page being accessed after a function call is
-   * p=pageSamples/totalSamples. The probability that the page is not accessed
-   * is (1-p), and the probability that it is not in the cache (i.e. not accessed
-   * during the last ITLBEntries function calls) is (1-p)^ITLBEntries
-   */
+  /// The probability that a page with a given weight is not present in the cache
+  ///
+  /// Assume that the hot functions are called in a random order; then the
+  /// probability of an i-TLB page being accessed after a function call is
+  /// p=pageSamples/totalSamples. The probability that the page is not accessed
+  /// is (1-p), and the probability that it is not in the cache (i.e. not accessed
+  /// during the last ITLBEntries function calls) is (1-p)^ITLBEntries
   double missProbability(double PageSamples) const {
     double P = PageSamples / TotalSamples;
     double X = ITLBEntries;
@@ -330,10 +166,8 @@ public:
     return pow(1.0 - P, X);
   }
 
-  /*
-   * The expected number of calls within a given cluster with both endpoints on
-   * the same cache page
-   */
+  /// The expected number of calls within a given cluster with both endpoints on
+  /// the same cache page
  double shortCalls(const Cluster *Cluster) const {
    double Calls = 0;
    for (auto TargetId : Cluster->targets()) {
@@ -352,10 +186,8 @@ public:
     return Calls;
   }
 
-  /*
-   * The number of calls between the two clusters with both endpoints on
-   * the same i-TLB page, assuming that a given pair of clusters gets merged
-   */
+  /// The number of calls between the two clusters with both endpoints on
+  /// the same i-TLB page, assuming that a given pair of clusters gets merged
   double shortCalls(const Cluster *ClusterPred,
                     const Cluster *ClusterSucc) const {
     double Calls = 0;
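The (1-p)^ITLBEntries model above is easy to sanity-check numerically. A standalone sketch (not part of the patch) with made-up sample counts and the default 16 iTLB entries:

```cpp
#include <cmath>
#include <cstdio>

// missProbability(PageSamples) = (1 - PageSamples/TotalSamples)^ITLBEntries.
// A page holding 10% of all samples is absent from a 16-entry iTLB with
// probability 0.9^16, i.e. it is present about 81.5% of the time.
int main() {
  const double TotalSamples = 1000.0;
  const double ITLBEntries = 16.0;
  for (double PageSamples : {10.0, 100.0, 500.0}) {
    double P = PageSamples / TotalSamples;
    printf("p=%.2f miss=%.3f\n", P, std::pow(1.0 - P, ITLBEntries));
  }
  return 0;
}
```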
@@ -389,18 +221,16 @@ public:
     return Calls;
   }
 
-  /*
-   * The gain of merging two clusters.
-   *
-   * We assume that the final clusters are sorted by their density, and hence
-   * every cluster is likely to be adjacent with clusters of the same density.
-   * Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
-   * which is used to compute the probability of cache misses for long calls
-   * of a given cluster.
-   * The result is also scaled by the size of the resulting cluster in order to
-   * increse the chance of merging short clusters, which is helpful for
-   * the i-cache performance.
-   */
+  /// The gain of merging two clusters.
+  ///
+  /// We assume that the final clusters are sorted by their density, and hence
+  /// every cluster is likely to be adjacent with clusters of the same density.
+  /// Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
+  /// which is used to compute the probability of cache misses for long calls
+  /// of a given cluster.
+  /// The result is also scaled by the size of the resulting cluster in order to
+  /// increase the chance of merging short clusters, which is helpful for
+  /// the i-cache performance.
   double mergeGain(const Cluster *ClusterPred,
                    const Cluster *ClusterSucc) const {
     if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) {
@@ -435,9 +265,7 @@ public:
     return Gain;
   }
 
-  /*
-   * For every active cluster, compute its total weight of outgoing edges
-   */
+  /// For every active cluster, compute its total weight of outgoing edges
   std::unordered_map<Cluster *, double> computeOutgoingWeight() {
     std::unordered_map<Cluster *, double> OutWeight;
     for (auto ClusterPred : Clusters) {
@@ -456,9 +284,7 @@ public:
     return OutWeight;
   }
 
-  /*
-   * Find pairs of clusters that call each other with high probability
-   */
+  /// Find pairs of clusters that call each other with high probability
   std::vector<std::pair<Cluster *, Cluster *>> findClustersToMerge() {
     // compute total weight of outgoing edges for every cluster
     auto OutWeight = computeOutgoingWeight();
@@ -503,10 +329,8 @@ public:
     return PairsToMerge;
   }
 
-  /*
-   * Run the first optimization pass of the hfsort+ algorithm:
-   * Merge clusters that call each other with high probability
-   */
+  /// Run the first optimization pass of the hfsort+ algorithm:
+  /// Merge clusters that call each other with high probability
   void runPassOne() {
     while (Clusters.size() > 1) {
       // pairs of clusters that will be merged on this iteration
@@ -523,11 +347,9 @@ public:
     }
   }
 
-  /*
-   * Run the second optimization pass of the hfsort+ algorithm:
-   * Merge pairs of clusters while there is an improvement in the
-   * expected cache miss ratio
-   */
+  /// Run the second optimization pass of the hfsort+ algorithm:
+  /// Merge pairs of clusters while there is an improvement in the
+  /// expected cache miss ratio
   void runPassTwo() {
     while (Clusters.size() > 1) {
       Cluster *BestClusterPred = nullptr;
@@ -535,7 +357,7 @@ public:
       double BestGain = -1;
       for (auto ClusterPred : Clusters) {
         // get candidates for merging with the current cluster
-        Adjacent.forallAdjacent(
+        Adjacent.forAllAdjacent(
           ClusterPred,
           // find the best candidate
           [&](Cluster *ClusterSucc) {
@@ -565,9 +387,7 @@ public:
     }
   }
 
-  /*
-   * Run hfsort+ algorithm and return ordered set of function clusters.
-   */
+  /// Run hfsort+ algorithm and return ordered set of function clusters.
   std::vector<Cluster *> run() {
     DEBUG(dbgs() << "Starting hfsort+ w/"
                  << (UseGainCache ? "gain cache" : "no cache")
@@ -602,19 +422,37 @@
     Addr(Cg.numNodes(), InvalidAddr),
     TotalSamples(0.0),
     Clusters(initializeClusters()),
-    Adjacent(Cg, Clusters, FuncCluster),
+    Adjacent(Cg.numNodes()),
     UseGainCache(UseGainCache),
     GainCache(Clusters.size()) {
+    // Initialize the adjacency matrix
+    Adjacent.initialize(Clusters);
+    for (auto *A : Clusters) {
+      for (auto TargetId : A->targets()) {
+        for (auto Succ : Cg.successors(TargetId)) {
+          auto *B = FuncCluster[Succ];
+          if (!B || B == A) continue;
+          const auto &Arc = *Cg.findArc(TargetId, Succ);
+          if (Arc.weight() > 0.0)
+            Adjacent.set(A, B);
+        }
+        for (auto Pred : Cg.predecessors(TargetId)) {
+          auto *B = FuncCluster[Pred];
+          if (!B || B == A) continue;
+          const auto &Arc = *Cg.findArc(Pred, TargetId);
+          if (Arc.weight() > 0.0)
+            Adjacent.set(A, B);
+        }
+      }
+    }
   }
 
-private:
-  /*
-   * Initialize the set of active clusters, function id to cluster mapping,
-   * total number of samples and function addresses.
-   */
+private:
+  /// Initialize the set of active clusters, function id to cluster mapping,
+  /// total number of samples and function addresses.
   std::vector<Cluster *> initializeClusters() {
-    ITLBPageSize = opts::ITLBPageSizeParam;
-    ITLBEntries = opts::ITLBEntriesParam;
+    ITLBPageSize = opts::ITLBPageSize;
+    ITLBEntries = opts::ITLBEntries;
 
     // Initialize clusters
     std::vector<Cluster *> Clusters;
@@ -632,16 +470,8 @@
     return Clusters;
   }
 
-  /*
-   * Merge cluster From into cluster Into and update the list of active clusters
-   */
+  /// Merge cluster From into cluster Into and update the list of active clusters
   void mergeClusters(Cluster *Into, Cluster *From) {
-    DEBUG(
-      if (opts::Verbosity > 0) {
-        dbgs() << "Merging cluster " << From->id()
-               << " into cluster " << Into->id() << "\n";
-      });
-
     // The adjacency merge must happen before the Cluster::merge since that
     // clobbers the contents of From.
     Adjacent.merge(Into, From);
@@ -690,7 +520,7 @@ private:
   std::vector<Cluster *> Clusters;
 
   // Cluster adjacency matrix
-  AdjacencyMatrix Adjacent;
+  AdjacencyMatrix<Cluster> Adjacent;
 
   // Use cache for mergeGain results
   bool UseGainCache;
@@ -699,10 +529,10 @@ private:
   // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
   // containing both x and y and all clusters adjacent to x and y (and recompute
   // them on the next iteration).
-  mutable PrecomputedResults GainCache;
+  mutable ClusterPairCache<Cluster, double> GainCache;
 };
 
-}
+} // end namespace anonymous
 
 std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache) {
   // It is required that the sum of incoming arc weights is not greater
diff --git a/bolt/Passes/ReorderAlgorithm.h b/bolt/Passes/ReorderAlgorithm.h
index fd50a6c311e5..5be8a93f6f1f 100644
--- a/bolt/Passes/ReorderAlgorithm.h
+++ b/bolt/Passes/ReorderAlgorithm.h
@@ -243,6 +243,16 @@ public:
       const BinaryFunction &BF, BasicBlockOrder &Order) const override;
 };
 
+/// A new reordering algorithm for basic blocks, cache+
+class CachePlusReorderAlgorithm : public ReorderAlgorithm {
+public:
+  explicit CachePlusReorderAlgorithm(
+      std::unique_ptr<ClusterAlgorithm> CAlgo) :
+    ReorderAlgorithm(std::move(CAlgo)) { }
+
+  void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
+};
 
 /// Toy example that simply reverses the original basic block order.
 class ReverseReorderAlgorithm : public ReorderAlgorithm {
diff --git a/bolt/Passes/ReorderUtils.h b/bolt/Passes/ReorderUtils.h
new file mode 100644
index 000000000000..15c68d65e635
--- /dev/null
+++ b/bolt/Passes/ReorderUtils.h
@@ -0,0 +1,112 @@
+//===--- ReorderUtils.h - Helper methods for function and block reordering ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H
+
+#include <memory>
+#include <vector>
+
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+namespace bolt {
+
+// This class maintains adjacency information for all Clusters being
+// processed. It is used for visiting all neighbors of any given Cluster
+// while merging pairs of Clusters. Every Cluster must implement the id() method
+template <typename Cluster> class AdjacencyMatrix {
+public:
+  explicit AdjacencyMatrix(size_t Size) : Bits(Size, BitVector(Size, false)) {}
+
+  void initialize(std::vector<Cluster *> &_Clusters) { Clusters = _Clusters; }
+
+  template <typename F> void forAllAdjacent(const Cluster *C, F Func) const {
+    const_cast<AdjacencyMatrix *>(this)->forAllAdjacent(C, Func);
+  }
+
+  template <typename F> void forAllAdjacent(const Cluster *C, F Func) {
+    for (auto I = Bits[C->id()].find_first(); I != -1;
+         I = Bits[C->id()].find_next(I)) {
+      Func(Clusters[I]);
+    }
+  }
+
+  /// Merge adjacency info from cluster B into cluster A. Info for cluster B is
+  /// left in an undefined state.
+  void merge(const Cluster *A, const Cluster *B) {
+    Bits[A->id()] |= Bits[B->id()];
+    Bits[A->id()][A->id()] = false;
+    Bits[A->id()][B->id()] = false;
+    Bits[B->id()][A->id()] = false;
+    for (auto I = Bits[B->id()].find_first(); I != -1;
+         I = Bits[B->id()].find_next(I)) {
+      Bits[I][A->id()] = true;
+      Bits[I][B->id()] = false;
+    }
+  }
+
+  void set(const Cluster *A, const Cluster *B) { set(A, B, true); }
+
+private:
+  void set(const Cluster *A, const Cluster *B, bool Value) {
+    assert(A != B);
+    Bits[A->id()][B->id()] = Value;
+    Bits[B->id()][A->id()] = Value;
+  }
+
+  std::vector<Cluster *> Clusters;
+  std::vector<BitVector> Bits;
+};
+
+// This class holds cached results of specified type for a pair of Clusters.
+// It can invalidate all cache entries associated with a given Cluster.
+template <typename Cluster, typename ValueType> class ClusterPairCache {
+public:
+  explicit ClusterPairCache(size_t Size)
+      : Size(Size), Cache(Size * Size), Valid(Size * Size, false) {}
+
+  bool contains(const Cluster *First, const Cluster *Second) const {
+    return Valid[index(First, Second)];
+  }
+
+  ValueType get(const Cluster *First, const Cluster *Second) const {
+    assert(contains(First, Second));
+    return Cache[index(First, Second)];
+  }
+
+  void set(const Cluster *First, const Cluster *Second, ValueType Value) {
+    const auto Index = index(First, Second);
+    Cache[Index] = Value;
+    Valid[Index] = true;
+  }
+
+  void invalidate(const Cluster *C) {
+    Valid.reset(C->id() * Size, (C->id() + 1) * Size);
+    for (size_t Id = 0; Id < Size; Id++) {
+      Valid.reset((Id * Size) + C->id());
+    }
+  }
+
+private:
+  size_t index(const Cluster *First, const Cluster *Second) const {
+    return (First->id() * Size) + Second->id();
+  }
+
+  size_t Size;
+  std::vector<ValueType> Cache;
+  BitVector Valid;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
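Both helpers require only that the Cluster type expose an id() method. A minimal usage sketch (toy cluster type; assumes ReorderUtils.h is on the include path; not part of the patch):

```cpp
#include <cstdio>

#include "ReorderUtils.h"

using namespace llvm::bolt;

// Toy cluster exposing the id() method both helpers rely on.
struct ToyCluster {
  size_t Idx;
  size_t id() const { return Idx; }
};

int main() {
  ToyCluster C0{0}, C1{1}, C2{2};
  std::vector<ToyCluster *> Clusters = {&C0, &C1, &C2};

  AdjacencyMatrix<ToyCluster> Adjacent(Clusters.size());
  Adjacent.initialize(Clusters);
  Adjacent.set(&C0, &C1);
  Adjacent.set(&C1, &C2);

  // Cache a value for (C0, C1), then invalidate everything touching C1,
  // as is done after merging a cluster pair.
  ClusterPairCache<ToyCluster, double> Cache(Clusters.size());
  Cache.set(&C0, &C1, 42.0);
  printf("cached: %d\n", Cache.contains(&C0, &C1)); // 1
  Cache.invalidate(&C1);
  printf("cached: %d\n", Cache.contains(&C0, &C1)); // 0

  // After merging C1 into C0, C0 inherits C1's remaining neighbor C2.
  Adjacent.merge(&C0, &C1);
  Adjacent.forAllAdjacent(&C0, [](ToyCluster *C) {
    printf("neighbor: %zu\n", C->id()); // 2
  });
  return 0;
}
```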