Run buildCFG in disassembly in parallel

Summary:
This diff parallelizes the construction of the control-flow graph (CFG)
during disassembly. It also extends parallel-utilities with a new
interface that supports running tasks on BinaryFunctions that add
instruction annotations. This pattern recurs in several places, e.g.
frame optimization, which justifies an interface that abstracts away
the messy details.
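For orientation, usage of the new interface (declared in the ParallelUtilities.h
changes below) looks roughly like the following sketch. It is illustrative only:
the annotation name "MyMarker", the log name, and the assumption of a
BinaryContext &BC whose functions already have a CFG are not part of this diff.
The essential point is that the work function receives a per-task allocator id
and forwards it to every annotation-creating call:

ParallelUtilities::WorkFuncWithAllocTy WorkFun =
    [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) {
      // Annotations created with AllocId land in this task's private
      // allocator, so concurrent tasks never share a bump allocator.
      for (auto &BB : BF)
        for (auto &Inst : BB)
          BC.MIB->addAnnotation(Inst, "MyMarker", 0U, AllocId);
    };
ParallelUtilities::PredicateTy SkipFunc = [](const BinaryFunction &BF) {
  return !BF.isSimple(); // skip functions we cannot process safely
};
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
    BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
    SkipFunc, "my-pass");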

(cherry picked from FBD16232809)
Author: laith sakka, 2019-07-12 07:25:50 -07:00; committed by Maksim Panchenko
parent f4ab6e6924
commit 7d42835418
10 changed files with 300 additions and 132 deletions

BinaryFunction.cpp

@@ -141,7 +141,7 @@ PrintOnly("print-only",
cl::Hidden,
cl::cat(BoltCategory));
static cl::opt<bool>
cl::opt<bool>
TimeBuild("time-build",
cl::desc("print time spent constructing binary functions"),
cl::ZeroOrMore,
@@ -176,8 +176,6 @@ namespace llvm {
namespace bolt {
constexpr unsigned BinaryFunction::MinAlign;
const char BinaryFunction::TimerGroupName[] = "buildfuncs";
const char BinaryFunction::TimerGroupDesc[] = "Build Binary Functions";
namespace {
@@ -887,15 +885,18 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address,
if (MCSymbol *IslandSym = getOrCreateIslandAccess(Address)) {
return IslandSym;
}
MCSymbol *Result = BC.Ctx->createTempSymbol();
MCSymbol *Result;
{
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
Result = BC.Ctx->createTempSymbol();
}
Labels[Offset] = Result;
return Result;
}
void BinaryFunction::disassemble(ArrayRef<uint8_t> FunctionData) {
NamedRegionTimer T("disassemble", "Disassemble function", TimerGroupName,
TimerGroupDesc, opts::TimeBuild);
NamedRegionTimer T("disassemble", "Disassemble function", "buildfuncs",
"Build Binary Functions", opts::TimeBuild);
assert(FunctionData.size() == getSize() &&
"function size does not match raw data size");
@@ -1449,8 +1450,8 @@ void BinaryFunction::postProcessJumpTables() {
TakenBranches.erase(NewEnd, TakenBranches.end());
}
bool BinaryFunction::postProcessIndirectBranches() {
bool BinaryFunction::postProcessIndirectBranches(
MCPlusBuilder::AllocatorIdTy AllocId) {
auto addUnknownControlFlow = [&](BinaryBasicBlock &BB) {
HasUnknownControlFlow = true;
BB.removeAllSuccessors();
@@ -1572,7 +1573,7 @@ bool BinaryFunction::postProcessIndirectBranches() {
// fail to match the pattern.
if (HasUnknownControlFlow && NumIndirectJumps == 1 &&
JumpTables.size() == 1 && LastIndirectJump) {
BC.MIB->setJumpTable(*LastIndirectJump, LastJT, LastJTIndexReg);
BC.MIB->setJumpTable(*LastIndirectJump, LastJT, LastJTIndexReg, AllocId);
HasUnknownControlFlow = false;
// re-populate successors based on the jump table.
@@ -1624,9 +1625,7 @@ void BinaryFunction::recomputeLandingPads() {
}
}
bool BinaryFunction::buildCFG() {
NamedRegionTimer T("buildcfg", "Build CFG", TimerGroupName, TimerGroupDesc,
opts::TimeBuild);
bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
auto &MIB = BC.MIB;
if (!isSimple()) {
@@ -1677,7 +1676,8 @@ bool BinaryFunction::buildCFG() {
assert(PrevBB && PrevBB != InsertBB && "invalid previous block");
auto *PrevInstr = PrevBB->getLastNonPseudoInstr();
if (PrevInstr && !MIB->hasAnnotation(*PrevInstr, "Offset"))
MIB->addAnnotation(*PrevInstr, "Offset", static_cast<uint32_t>(Offset));
MIB->addAnnotation(*PrevInstr, "Offset", static_cast<uint32_t>(Offset),
AllocatorId);
};
for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) {
@@ -1705,9 +1705,17 @@ bool BinaryFunction::buildCFG() {
DEBUG(dbgs() << "SDTMarker detected in the input at : "
<< utohexstr(InstrInputAddr) << "\n");
MIB->addAnnotation<uint64_t>(Instr, "SDTMarker", InstrInputAddr);
BC.SDTMarkers[InstrInputAddr].Label =
getOrCreateLocalLabel(InstrInputAddr);
MIB->addAnnotation<uint64_t>(Instr, "SDTMarker", InstrInputAddr,
AllocatorId);
// This mutex is used to lock concurrent writes to GlobalSymbols and
// BinaryDataMap that happen in registerNameAtAddress
{
static std::shared_timed_mutex GlobalSymbolCreationMtx;
std::unique_lock<std::shared_timed_mutex> Lock(GlobalSymbolCreationMtx);
BC.SDTMarkers[InstrInputAddr].Label =
getOrCreateLocalLabel(InstrInputAddr);
}
}
// Ignore nops except SDT markers. We use nops to derive alignment of the
@@ -1730,10 +1738,13 @@ bool BinaryFunction::buildCFG() {
// Temporarily restore inserter basic block.
InsertBB = PrevBB;
} else {
InsertBB = addBasicBlock(Offset,
BC.Ctx->createTempSymbol("FT", true),
opts::PreserveBlocksAlignment &&
IsLastInstrNop);
MCSymbol *Label;
{
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
Label = BC.Ctx->createTempSymbol("FT", true);
}
InsertBB = addBasicBlock(
Offset, Label, opts::PreserveBlocksAlignment && IsLastInstrNop);
updateOffset(LastInstrOffset);
}
}
@@ -1835,7 +1846,7 @@ bool BinaryFunction::buildCFG() {
DEBUG(dbgs() << "last block was marked as a fall-through in " << *this
<< '\n');
}
// Assign landing pads and throwers info.
recomputeLandingPads();
@@ -1843,7 +1854,7 @@ bool BinaryFunction::buildCFG() {
annotateCFIState();
// Annotate invoke instructions with GNU_args_size data.
propagateGnuArgsSizeInfo();
propagateGnuArgsSizeInfo(AllocatorId);
// Set the basic block layout to the original order and set end offsets.
PrevBB = nullptr;
@@ -1871,7 +1882,7 @@ bool BinaryFunction::buildCFG() {
CurrentState = State::CFG;
// Make any necessary adjustments for indirect branches.
if (!postProcessIndirectBranches()) {
if (!postProcessIndirectBranches(AllocatorId)) {
if (opts::Verbosity) {
errs() << "BOLT-WARNING: failed to post-process indirect branches for "
<< *this << '\n';
@@ -3303,7 +3314,8 @@ void BinaryFunction::fixBranches() {
&& "Invalid CFG detected after fixing branches");
}
void BinaryFunction::propagateGnuArgsSizeInfo() {
void BinaryFunction::propagateGnuArgsSizeInfo(
MCPlusBuilder::AllocatorIdTy AllocId) {
assert(CurrentState == State::Disassembled && "unexpected function state");
if (!hasEHRanges() || !usesGnuArgsSize())
@@ -3329,7 +3341,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo() {
}
} else if (BC.MIB->isInvoke(Instr)) {
// Add the value of GNU_args_size as an extra operand to invokes.
BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize);
BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize, AllocId);
}
++II;
}
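A note on the pattern that recurs throughout this file: MCContext::createTempSymbol
mutates shared symbol tables and is not thread-safe, so each call site now takes
BC.CtxMutex exclusively for the narrowest possible scope. Below is a minimal
self-contained sketch of the idiom; the Context type is a stand-in, not BOLT's
actual MCContext:

#include <mutex>
#include <shared_mutex>

struct Context {                  // stand-in for MCContext (not thread-safe)
  unsigned NextTemp = 0;
  unsigned createTempSymbol() { return NextTemp++; }
};

Context Ctx;
std::shared_timed_mutex CtxMutex; // stand-in for BinaryContext::CtxMutex

unsigned createTempSymbolThreadSafe() {
  // Exclusive lock held only for the symbol creation itself, exactly as
  // buildCFG does around BC.Ctx->createTempSymbol().
  std::unique_lock<std::shared_timed_mutex> Lock(CtxMutex);
  return Ctx.createTempSymbol();
}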

BinaryFunction.h

@@ -367,7 +367,7 @@ private:
/// Associate DW_CFA_GNU_args_size info with invoke instructions
/// (call instructions with non-empty landing pad).
void propagateGnuArgsSizeInfo();
void propagateGnuArgsSizeInfo(MCPlusBuilder::AllocatorIdTy AllocId);
/// Synchronize branch instructions with CFG.
void postProcessBranches();
@@ -791,7 +791,6 @@ public:
return LocSyms[Idx];
}
/// Update layout of basic blocks used for output.
void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) {
BasicBlocksPreviousLayout = BasicBlocksLayout;
@@ -1358,6 +1357,7 @@ public:
bool DeriveAlignment = false) {
assert(BC.Ctx && "cannot be called with empty context");
if (!Label) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
Label = BC.Ctx->createTempSymbol("BB", true);
}
auto BB = std::unique_ptr<BinaryBasicBlock>(
@@ -1384,9 +1384,10 @@ public:
assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) &&
"basic block already exists in pre-CFG state");
if (!Label)
if (!Label) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
Label = BC.Ctx->createTempSymbol("BB", true);
}
auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment);
BasicBlocks.emplace_back(BBPtr.release());
@@ -1837,6 +1838,7 @@ public:
// Register our island at global namespace
Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat");
// Internal bookkeeping
const auto Offset = Address - getAddress();
assert((!IslandOffsets.count(Offset) || IslandOffsets[Offset] == Symbol) &&
@@ -1963,7 +1965,7 @@ public:
///
/// Returns true on success and update the current function state to
/// State::CFG. Returns false if CFG cannot be built.
bool buildCFG();
bool buildCFG(MCPlusBuilder::AllocatorIdTy);
/// Read any kind of profile information available for the function.
void readProfile();
@@ -1984,7 +1986,7 @@ public:
///
/// Return true upon successful processing, or false if the control flow
/// cannot be statically evaluated for any given indirect branch.
bool postProcessIndirectBranches();
bool postProcessIndirectBranches(MCPlusBuilder::AllocatorIdTy AllocId);
/// In functions with multiple entry points, the profile collection records
/// data for other entry points in a different function entry. This function

MCPlusBuilder.cpp

@@ -148,12 +148,13 @@ int64_t MCPlusBuilder::getGnuArgsSize(const MCInst &Inst) const {
return *Value;
}
void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) {
void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
AllocatorIdTy AllocId) {
assert(GnuArgsSize >= 0 && "cannot set GNU_args_size to negative value");
assert(getGnuArgsSize(Inst) == -1LL && "GNU_args_size already set");
assert(isInvoke(Inst) && "GNU_args_size can only be set for invoke");
setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize);
setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize, AllocId);
}
uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const {
@@ -168,11 +169,11 @@ uint16_t MCPlusBuilder::getJumpTableIndexReg(const MCInst &Inst) const {
}
bool MCPlusBuilder::setJumpTable(MCInst &Inst, uint64_t Value,
uint16_t IndexReg) {
uint16_t IndexReg, AllocatorIdTy AllocId) {
if (!isIndirectBranch(Inst))
return false;
setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value);
getOrCreateAnnotationAs<uint16_t>(Inst, "JTIndexReg") = IndexReg;
setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value, AllocId);
addAnnotation<>(Inst, "JTIndexReg", IndexReg, AllocId);
return true;
}
@@ -249,10 +250,10 @@ MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const {
const auto Imm = AnnotationInst->getOperand(I).getImm();
const auto Index = extractAnnotationIndex(Imm);
const auto Value = extractAnnotationValue(Imm);
const auto *Annotation =
const auto *Annotation =
reinterpret_cast<const MCAnnotation *>(Value);
if (Index >= MCAnnotation::kGeneric) {
OS << " # " << AnnotationNames[Index - MCAnnotation::kGeneric]
OS << " # " << AnnotationNames[Index - MCAnnotation::kGeneric]
<< ": ";
Annotation->print(OS);
}
@@ -427,7 +428,7 @@ bool MCPlusBuilder::evaluateBranch(const MCInst &Inst, uint64_t Addr,
return AliasMap[Reg];
return AliasMap[SuperReg[Reg]];
}
uint8_t
MCPlusBuilder::getRegSize(MCPhysReg Reg) const {
// SizeMap caches a mapping of registers to their sizes

MCPlusBuilder.h

@@ -73,13 +73,6 @@ private:
/// A variable that is used to generate unique ids for annotation allocators
AllocatorIdTy MaxAllocatorId = 0;
/// Return the annotation allocator of a given id
AnnotationAllocator &getAnnotationAllocator(AllocatorIdTy AllocatorId) {
assert(AnnotationAllocators.count(AllocatorId) &&
"allocator not initialized");
return AnnotationAllocators.find(AllocatorId)->second;
}
/// We encode Index and Value into a 64-bit immediate operand value.
static int64_t encodeAnnotationImm(unsigned Index, int64_t Value) {
assert(Index < 256 && "annotation index max value exceeded");
@@ -296,18 +289,30 @@ public:
MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
const MCRegisterInfo *RegInfo)
: Analysis(Analysis), Info(Info), RegInfo(RegInfo) {
// Initialize the default annotation allocator with id 0.
// Initialize the default annotation allocator with id 0
AnnotationAllocators.emplace(0, AnnotationAllocator());
MaxAllocatorId++;
}
/// Initialize a new annotation allocator and return its id.
/// Initialize a new annotation allocator and return its id
AllocatorIdTy initializeNewAnnotationAllocator() {
AnnotationAllocators.emplace(MaxAllocatorId, AnnotationAllocator());
return MaxAllocatorId++;
}
/// Free the values allocator within the annotation allocator.
/// Return the annotation allocator of a given id
AnnotationAllocator &getAnnotationAllocator(AllocatorIdTy AllocatorId) {
assert(AnnotationAllocators.count(AllocatorId) &&
"allocator not initialized");
return AnnotationAllocators.find(AllocatorId)->second;
}
/// Check if an annotation allocator with the given id exists
bool checkAllocatorExists(AllocatorIdTy AllocatorId) {
return AnnotationAllocators.count(AllocatorId);
}
/// Free the values allocator within the annotation allocator
void freeValuesAllocator(AllocatorIdTy AllocatorId) {
auto &Allocator = getAnnotationAllocator(AllocatorId);
for (auto *Annotation : Allocator.AnnotationPool)
@@ -321,7 +326,7 @@ public:
freeAnnotations();
}
/// Free all memory allocated for annotations.
/// Free all memory allocated for annotations
void freeAnnotations() {
for (auto &Element : AnnotationAllocators) {
auto &Allocator = Element.second;
@@ -1016,7 +1021,8 @@ public:
int64_t getGnuArgsSize(const MCInst &Inst) const;
/// Add the value of GNU_args_size to Inst if it already has EH info.
void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize);
void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
AllocatorIdTy AllocId = 0);
/// Return jump table addressed by this instruction.
uint64_t getJumpTable(const MCInst &Inst) const;
@@ -1025,7 +1031,8 @@ public:
uint16_t getJumpTableIndexReg(const MCInst &Inst) const;
/// Set jump table addressed by this instruction.
bool setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg);
bool setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg,
AllocatorIdTy AllocId = 0);
/// Disassociate instruction with a jump table.
bool unsetJumpTable(MCInst &Inst);
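The allocator bookkeeping above boils down to an id-keyed registry with a
reserved default id 0. A freestanding sketch of the same scheme, with the real
AnnotationAllocator (which owns bump allocators and the AnnotationPool seen in
freeValuesAllocator) replaced by an empty placeholder type:

#include <cassert>
#include <unordered_map>

struct Arena {}; // placeholder for AnnotationAllocator's bump allocators

class AllocatorRegistry {
  std::unordered_map<unsigned, Arena> Allocators;
  unsigned MaxId = 0;

public:
  AllocatorRegistry() {
    Allocators.emplace(MaxId++, Arena()); // default allocator, id 0
  }
  unsigned initializeNew() {              // one arena per parallel task
    Allocators.emplace(MaxId, Arena());
    return MaxId++;
  }
  bool exists(unsigned Id) const { return Allocators.count(Id); }
  Arena &get(unsigned Id) {
    assert(exists(Id) && "allocator not initialized");
    return Allocators.find(Id)->second;
  }
};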

ParallelUtilities.cpp

@@ -16,7 +16,6 @@
#define DEBUG_TYPE "par-utils"
namespace opts {
extern cl::OptionCategory BoltCategory;
@@ -32,34 +31,87 @@ NoThreads("no-threads",
cl::init(false),
cl::cat(BoltCategory));
cl::opt<unsigned>
cl::opt<unsigned>
TaskCount("tasks-per-thread",
cl::desc("number of tasks to be created per thread"),
cl::init(20),
cl::cat(BoltCategory));
}
namespace {
/// A single thread pool that is used to run parallel tasks
std::unique_ptr<ThreadPool> ThPoolPtr;
} // namespace
} // namespace opts
namespace llvm {
namespace bolt {
namespace ParallelUtilities {
ThreadPool &getThreadPool() {
if (ThPoolPtr.get())
return *ThPoolPtr;
namespace {
/// A single thread pool that is used to run parallel tasks
std::unique_ptr<ThreadPool> ThreadPoolPtr;
ThPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);
return *ThPoolPtr;
unsigned computeCostFor(const BinaryFunction &BF,
const PredicateTy &SkipPredicate,
const SchedulingPolicy &SchedPolicy) {
if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
return 1;
if (SkipPredicate && SkipPredicate(BF))
return 0;
switch (SchedPolicy) {
case SchedulingPolicy::SP_CONSTANT:
return 1;
case SchedulingPolicy::SP_INST_LINEAR:
return BF.getSize();
case SchedulingPolicy::SP_INST_QUADRATIC:
return BF.getSize() * BF.getSize();
case SchedulingPolicy::SP_BB_LINEAR:
return BF.size();
case SchedulingPolicy::SP_BB_QUADRATIC:
return BF.size() * BF.size();
default:
llvm_unreachable("unsupported scheduling policy");
}
}
inline unsigned estimateTotalCost(const BinaryContext &BC,
const PredicateTy &SkipPredicate,
SchedulingPolicy &SchedPolicy) {
if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
return BC.getBinaryFunctions().size();
unsigned TotalCost = 0;
for (auto &BFI : BC.getBinaryFunctions()) {
auto &BF = BFI.second;
TotalCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
}
// Switch to trivial scheduling if total estimated work is zero
if (TotalCost == 0) {
outs() << "BOLT-WARNING: Running parallel work of 0 estimated cost, will "
"switch to trivial scheduling.\n";
SchedPolicy = SP_TRIVIAL;
TotalCost = BC.getBinaryFunctions().size();
}
return TotalCost;
}
} // namespace
ThreadPool &getThreadPool() {
if (ThreadPoolPtr.get())
return *ThreadPoolPtr;
ThreadPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);
return *ThreadPoolPtr;
}
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncTy WorkFunction, PredicateTy SkipPredicate,
std::string LogName, unsigned TasksPerThread) {
std::string LogName, bool ForceSequential,
unsigned TasksPerThread) {
if (BC.getBinaryFunctions().size() == 0)
return;
auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
Timer T(LogName, LogName);
@@ -75,65 +127,106 @@ void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
DEBUG(T.stopTimer());
};
if (opts::NoThreads) {
if (opts::NoThreads || ForceSequential) {
runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end());
return;
}
// Estimate the overall runtime cost using the scheduling policy
unsigned TotalCost = 0;
const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);
const unsigned BlocksCount = TasksPerThread * opts::ThreadCount;
if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL) {
TotalCost = BC.getBinaryFunctions().size();
} else {
for (auto &BFI : BC.getBinaryFunctions()) {
auto &BF = BFI.second;
if (SkipPredicate && SkipPredicate(BF))
continue;
if (SchedPolicy == SchedulingPolicy::SP_CONSTANT)
TotalCost++;
else if (SchedPolicy == SchedulingPolicy::SP_LINEAR)
TotalCost += BF.size();
else if (SchedPolicy == SchedulingPolicy::SP_QUADRATIC)
TotalCost += BF.size() * BF.size();
}
}
const unsigned BlockCost =
TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;
// Divide work into blocks of equal cost
ThreadPool &ThPool = getThreadPool();
const unsigned BlockCost = TotalCost / BlocksCount;
ThreadPool &Pool = getThreadPool();
auto BlockBegin = BC.getBinaryFunctions().begin();
unsigned CurrentCost = 0;
for (auto It = BC.getBinaryFunctions().begin();
It != BC.getBinaryFunctions().end(); ++It) {
auto &BF = It->second;
if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
CurrentCost++;
else {
if (SkipPredicate && SkipPredicate(BF))
continue;
if (SchedPolicy == SchedulingPolicy::SP_CONSTANT)
CurrentCost++;
else if (SchedPolicy == SchedulingPolicy::SP_LINEAR)
CurrentCost += BF.size();
else if (SchedPolicy == SchedulingPolicy::SP_QUADRATIC)
CurrentCost += BF.size() * BF.size();
}
CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
if (CurrentCost >= BlockCost) {
ThPool.async(runBlock, BlockBegin, std::next(It));
Pool.async(runBlock, BlockBegin, std::next(It));
BlockBegin = std::next(It);
CurrentCost = 0;
}
}
ThPool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end());
ThPool.wait();
Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end());
Pool.wait();
}
void runOnEachFunctionWithUniqueAllocId(
BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
std::string LogName, bool ForceSequential, unsigned TasksPerThread) {
if (BC.getBinaryFunctions().size() == 0)
return;
std::shared_timed_mutex MainLock;
auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
std::map<uint64_t, BinaryFunction>::iterator BlockEnd,
MCPlusBuilder::AllocatorIdTy AllocId) {
Timer T(LogName, LogName);
DEBUG(T.startTimer());
std::shared_lock<std::shared_timed_mutex> Lock(MainLock);
for (auto It = BlockBegin; It != BlockEnd; ++It) {
auto &BF = It->second;
if (SkipPredicate && SkipPredicate(BF))
continue;
WorkFunction(BF, AllocId);
}
DEBUG(T.stopTimer());
};
if (opts::NoThreads || ForceSequential) {
runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end(), 0);
return;
}
// This lock is used to postpone task execution
std::unique_lock<std::shared_timed_mutex> Lock(MainLock);
// Estimate the overall runtime cost using the scheduling policy
const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);
const unsigned BlocksCount = TasksPerThread * opts::ThreadCount;
const unsigned BlockCost =
TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;
// Divide work into blocks of equal cost
ThreadPool &Pool = getThreadPool();
auto BlockBegin = BC.getBinaryFunctions().begin();
unsigned CurrentCost = 0;
unsigned AllocId = 1;
for (auto It = BC.getBinaryFunctions().begin();
It != BC.getBinaryFunctions().end(); ++It) {
auto &BF = It->second;
CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
if (CurrentCost >= BlockCost) {
if (!BC.MIB->checkAllocatorExists(AllocId)) {
auto Id = BC.MIB->initializeNewAnnotationAllocator();
assert(AllocId == Id && "unexpected allocator id created");
}
Pool.async(runBlock, BlockBegin, std::next(It), AllocId);
AllocId++;
BlockBegin = std::next(It);
CurrentCost = 0;
}
}
if (!BC.MIB->checkAllocatorExists(AllocId)) {
auto Id = BC.MIB->initializeNewAnnotationAllocator();
assert(AllocId == Id && "unexpected allocator id created");
}
Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end(), AllocId);
Lock.unlock();
Pool.wait();
}
} // namespace ParallelUtilities
} // namespace bolt
} // namespace llvm
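One subtlety in runOnEachFunctionWithUniqueAllocId above: MainLock is taken
exclusively by the enqueuing thread and shared by every task, so no task can
start until all allocator ids have been registered. This matters because
initializeNewAnnotationAllocator mutates the allocator map that running tasks
read. A self-contained sketch of that postponement idiom, using plain
std::thread in place of the ThreadPool:

#include <atomic>
#include <iostream>
#include <shared_mutex>
#include <thread>
#include <vector>

int main() {
  std::shared_timed_mutex MainLock;
  std::atomic<unsigned> Started{0};

  // Hold the lock exclusively while per-task state (one annotation
  // allocator per task id, in BOLT's case) is still being created.
  std::unique_lock<std::shared_timed_mutex> Hold(MainLock);

  std::vector<std::thread> Workers;
  for (unsigned Id = 1; Id <= 4; ++Id)
    Workers.emplace_back([&MainLock, &Started] {
      // Each task first takes the lock shared, so it parks here until
      // the main thread finishes setup and unlocks.
      std::shared_lock<std::shared_timed_mutex> Run(MainLock);
      ++Started;
    });

  // ... register per-task state here ...
  Hold.unlock(); // setup done: release all tasks at once

  for (auto &W : Workers)
    W.join();
  std::cout << Started << " tasks ran after setup completed\n";
}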

ParallelUtilities.h

@@ -16,9 +16,10 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
#define LLVM_TOOLS_LLVM_BOLT_PARALLEL_UTILITIES_H
#include "llvm/Support/ThreadPool.h"
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "MCPlusBuilder.h"
#include "llvm/Support/ThreadPool.h"
using namespace llvm;
@@ -32,27 +33,45 @@ namespace llvm {
namespace bolt {
namespace ParallelUtilities {
using WorkFuncWithAllocTy =
std::function<void(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy)>;
using WorkFuncTy = std::function<void(BinaryFunction &BF)>;
using PredicateTy = std::function<bool(const BinaryFunction &BF)>;
enum SchedulingPolicy {
SP_TRIVIAL, /// cost is estimated by the number of functions
SP_CONSTANT, /// cost is estimated by the number of non-skipped functions
SP_LINEAR, /// cost is estimated by the size of non-skipped functions
SP_QUADRATIC /// cost is estimated by the square of the size of non-skipped
/// functions
SP_TRIVIAL, /// cost is estimated by the number of functions
SP_CONSTANT, /// cost is estimated by the number of non-skipped functions
SP_INST_LINEAR, /// cost is estimated by inst count
SP_INST_QUADRATIC, /// cost is estimated by the square of the inst count
SP_BB_LINEAR, /// cost is estimated by BB count
SP_BB_QUADRATIC, /// cost is estimated by the square of the BB count
};
/// Return the managed thread pool and initialize it if not initialized.
ThreadPool &getThreadPool();
// Perform the work on each binary function, except those that are accepted
// by the SkipPredicate, scheduling heuristic is based on SchedPolicy
/// Perform the work on each BinaryFunction except those that are accepted
/// by SkipPredicate; the scheduling heuristic is based on SchedPolicy.
/// ForceSequential will selectively disable parallel execution and perform the
/// work sequentially.
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncTy WorkFunction,
PredicateTy SkipPredicate = PredicateTy(),
std::string LogName = "",
std::string LogName = "", bool ForceSequential = false,
unsigned TasksPerThread = opts::TaskCount);
/// Perform the work on each BinaryFunction except those that are rejected
/// by SkipPredicate, and create a unique annotation allocator for each
/// task. This should be used whenever the work function creates annotations to
/// allow thread-safe annotation creation.
/// ForceSequential will selectively disable parallel execution and perform the
/// work sequentially.
void runOnEachFunctionWithUniqueAllocId(
BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
std::string LogName = "", bool ForceSequential = false,
unsigned TasksPerThread = opts::TaskCount);
} // namespace ParallelUtilities
} // namespace bolt
} // namespace llvm
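To make the scheduling arithmetic concrete (numbers hypothetical): with
opts::ThreadCount = 4 and the default TasksPerThread of 20, BlocksCount is 80.
If estimateTotalCost returns 8000 under SP_INST_LINEAR (the non-skipped
functions span 8000 bytes, which computeCostFor uses as a proxy for instruction
count), then BlockCost = 8000 / 80 = 100, and the partitioning loop closes off
a task each time its running cost reaches 100, so large functions get tasks of
their own while small ones are batched together.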

Passes/BinaryPasses.cpp

@@ -355,7 +355,7 @@ void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
};
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_LINEAR, WorkFun, SkipFunc,
BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
"ReorderBasicBlocks");
outs() << "BOLT-INFO: basic block reordering modified layout of "

Passes/IdenticalCodeFolding.cpp

@@ -34,7 +34,7 @@ UseDFS("icf-dfs",
cl::ReallyHidden,
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
TimeICF("time-icf",
cl::desc("time icf steps"),
@@ -354,7 +354,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
// Make sure indices are in-order.
BF.updateLayoutIndices();
// Pre-compute hash before pushing into hashtable.
// Pre-compute hash before pushing into hashtable.
BF.hash(/*Recompute=*/true, opts::UseDFS);
}
DEBUG(T.stopTimer());
@@ -462,7 +462,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
auto &Candidates = Entry.second;
if (Candidates.size() < 2)
continue;
if (opts::NoThreads)
performFoldingForItem(Candidates);
else

RewriteInstance.cpp

@@ -9,7 +9,6 @@
//
//===----------------------------------------------------------------------===//
#include "RewriteInstance.h"
#include "BinaryBasicBlock.h"
#include "BinaryContext.h"
@@ -23,6 +22,7 @@
#include "Exceptions.h"
#include "ExecutableFileMemoryManager.h"
#include "MCPlusBuilder.h"
#include "ParallelUtilities.h"
#include "Passes/ReorderFunctions.h"
#include "ProfileReader.h"
#include "ProfileWriter.h"
@@ -36,8 +36,8 @@
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCDwarf.h"
@@ -60,8 +60,8 @@
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/raw_ostream.h"
@@ -92,6 +92,7 @@ extern cl::opt<MacroFusionType> AlignMacroOpFusion;
extern cl::opt<JumpTableSupportLevel> JumpTables;
extern cl::list<std::string> ReorderData;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
extern cl::opt<bool> TimeBuild;
cl::opt<bool>
Instrument("instrument-experimental",
@@ -431,6 +432,12 @@ TimeRewrite("time-rewrite",
cl::Hidden,
cl::cat(BoltCategory));
static cl::opt<bool>
SequentialDisassembly("sequential-disassembly",
cl::desc("performs disassembly sequentially"),
cl::init(false),
cl::cat(BoltOptCategory));
bool isHotTextMover(const BinaryFunction &Function) {
for (auto &SectionName : opts::HotTextMoveSections) {
if (Function.getOriginSectionName() == SectionName)
@@ -766,7 +773,7 @@ void RewriteInstance::reset() {
}
}
bool RewriteInstance::shouldDisassemble(BinaryFunction &BF) const {
bool RewriteInstance::shouldDisassemble(const BinaryFunction &BF) const {
// If we have to relocate the code we have to disassemble all functions.
if (!BF.getBinaryContext().HasRelocations && !opts::shouldProcess(BF)) {
DEBUG(dbgs() << "BOLT: skipping processing function " << BF
@@ -1493,7 +1500,7 @@ void RewriteInstance::discoverFileObjects() {
// Skip symbols from zero-sized sections.
if (!Section->getSize())
continue;
BF = BC->createBinaryFunction(UniqueName, *Section, Address,
SymbolSize, IsSimple);
}
@@ -1894,12 +1901,12 @@ void RewriteInstance::readSpecialSections() {
BC->HasRelocations = HasTextRelocations &&
(opts::RelocationMode != cl::BOU_FALSE);
// Force non-relocation mode for heatmap generation
if (opts::HeatmapMode) {
BC->HasRelocations = false;
}
if (BC->HasRelocations) {
outs() << "BOLT-INFO: enabling " << (opts::StrictMode ? "strict " : "")
<< "relocation mode\n";
@@ -2744,14 +2751,41 @@ void RewriteInstance::disassembleFunctions() {
if (Function.getLSDAAddress() != 0)
Function.parseLSDA(getLSDAData(), getLSDAAddress());
if (!Function.buildCFG())
continue;
if (opts::PrintAll)
Function.print(outs(), "while building cfg", true);
} // Iterate over all functions
// Run buildCFG in parallel for all functions
{
NamedRegionTimer T("buildCFG", "buildCFG", "buildfuncs",
"Build Binary Functions", opts::TimeBuild);
// Create annotation indices to allow lock-free execution
BC->MIB->getOrCreateAnnotationIndex("Offset");
BC->MIB->getOrCreateAnnotationIndex("JTIndexReg");
BC->MIB->getOrCreateAnnotationIndex("SDTMarker");
ParallelUtilities::WorkFuncWithAllocTy WorkFun =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) {
if (!BF.buildCFG(AllocId))
return;
if (opts::PrintAll) {
static std::mutex CriticalSectionMutex;
std::lock_guard<std::mutex> Lock(CriticalSectionMutex);
BF.print(outs(), "while building cfg", true);
}
};
ParallelUtilities::PredicateTy SkipPredicate =
[&](const BinaryFunction &BF) {
return !shouldDisassemble(BF) || !BF.isSimple();
};
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
*BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "disassembleFunctions-buildCFG",
/*ForceSequential*/ opts::SequentialDisassembly);
}
BC->postProcessSymbolTable();
}
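The three getOrCreateAnnotationIndex calls above warm the annotation
name-to-index table from the main thread, so the parallel tasks only ever read
it and the lookup path needs no locking. A toy illustration of that
pre-registration idea (not BOLT's actual MCPlusBuilder internals):

#include <cassert>
#include <string>
#include <unordered_map>

class AnnotationIndexTable {
  std::unordered_map<std::string, unsigned> NameToIndex;
  unsigned NextIndex = 0;

public:
  // Main thread only: may insert, so it must not race with readers.
  unsigned getOrCreate(const std::string &Name) {
    auto It = NameToIndex.find(Name);
    if (It != NameToIndex.end())
      return It->second;
    return NameToIndex.emplace(Name, NextIndex++).first->second;
  }
  // Safe from any thread once all names are pre-registered: read-only.
  unsigned get(const std::string &Name) const {
    auto It = NameToIndex.find(Name);
    assert(It != NameToIndex.end() && "index not pre-registered");
    return It->second;
  }
};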

RewriteInstance.h

@@ -266,7 +266,7 @@ private:
bool willOverwriteSection(StringRef SectionName);
/// Return true if the function \p BF should be disassembled.
bool shouldDisassemble(BinaryFunction &BF) const;
bool shouldDisassemble(const BinaryFunction &BF) const;
/// Parse .note.stapsdt section
void parseSDTNotes();