Rewrite #3 of machine block placement. This is based somewhat on the

second algorithm, but only loosely. It is more heavily based on the last discussion I had with Andy. It continues to walk from the inner-most loop outward, but there is a key difference. With this algorithm we ensure that as we visit each loop, the entire loop is merged into a single chain. At the end, the entire function is treated as a "loop", and merged into a single chain. This chain forms the desired sequence of blocks within the function. Switching to a single algorithm removes my biggest problem with the previous approaches -- they had different behavior depending on which system triggered the layout. Now there is exactly one algorithm and one basis for the decision making. The other key difference is how the chain is formed. This is based heavily on the idea Andy mentioned of keeping a worklist of blocks that are viable layout successors based on the CFG. Having this set allows us to consistently select the best layout successor for each block. It is expensive though. The code here remains very rough. There is a lot that needs to be done to clean up the code, and to make the runtime cost of this pass much lower. Very much WIP, but this was a giant chunk of code and I'd rather folks see it sooner than later. Everything remains behind a flag of course. I've added a couple of tests to exercise the issues that this iteration was motivated by: loop structure preservation. I've also fixed one test that was exhibiting the broken behavior of the previous version. llvm-svn: 144495
2011-11-13 11:20:44 +00:00 · 2011-11-13 11:20:44 +00:00 · 8d15078927
parent 1198d894d0
commit 8d15078927
2 changed files with 359 additions and 147 deletions
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@ -115,7 +115,7 @@ public:
  /// function. It also registers itself as the chain that block participates
  /// in with the BlockToChain mapping.
  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
-    : Blocks(1, BB), BlockToChain(BlockToChain) {
+    : Blocks(1, BB), BlockToChain(BlockToChain), LoopPredecessors(0) {
    assert(BB && "Cannot create a chain with a null basic block");
    BlockToChain[BB] = this;
  }
@ -138,7 +138,6 @@ public:
  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
    assert(BB);
    assert(!Blocks.empty());
-    assert(Blocks.back()->isSuccessor(BB));

    // Fast path in case we don't have a chain already.
    if (!Chain) {
@ -160,6 +159,12 @@ public:
      BlockToChain[*BI] = this;
    }
  }
+
+  /// \brief Count of predecessors within the loop currently being processed.
+  ///
+  /// This count is updated at each loop we process to represent the number of
+  /// in-loop predecessors of this chain.
+  unsigned LoopPredecessors;
 };
 }

@ -199,12 +204,15 @@ class MachineBlockPlacement : public MachineFunctionPass {
  /// between basic blocks.
  DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;

-  BlockChain *CreateChain(MachineBasicBlock *BB);
-  void mergeSuccessor(MachineBasicBlock *BB, BlockChain *Chain,
-                      BlockFilterSet *Filter = 0);
+  void markChainSuccessors(BlockChain &Chain,
+                           MachineBasicBlock *LoopHeaderBB,
+                           SmallVectorImpl<MachineBasicBlock *> &Blocks,
+                           const BlockFilterSet *BlockFilter = 0);
+  void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
+                  SmallVectorImpl<MachineBasicBlock *> &Blocks,
+                  const BlockFilterSet *BlockFilter = 0);
  void buildLoopChains(MachineFunction &F, MachineLoop &L);
  void buildCFGChains(MachineFunction &F);
-  void placeChainsTopologically(MachineFunction &F);
  void AlignLoops(MachineFunction &F);

 public:
@ -264,96 +272,130 @@ static std::string getBlockNum(MachineBasicBlock *BB) {
 }
 #endif

-/// \brief Helper to create a new chain for a single BB.
-///
-/// Takes care of growing the Chains, setting up the BlockChain object, and any
-/// debug checking logic.
-/// \returns A pointer to the new BlockChain.
-BlockChain *MachineBlockPlacement::CreateChain(MachineBasicBlock *BB) {
-  BlockChain *Chain =
-    new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
-  return Chain;
-}
+void MachineBlockPlacement::markChainSuccessors(
+    BlockChain &Chain,
+    MachineBasicBlock *LoopHeaderBB,
+    SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
+    const BlockFilterSet *BlockFilter) {
+  // Walk all the blocks in this chain, marking their successors as having
+  // a predecessor placed.
+  for (BlockChain::iterator CBI = Chain.begin(), CBE = Chain.end();
+       CBI != CBE; ++CBI) {
+    // Add any successors for which this is the only un-placed in-loop
+    // predecessor to the worklist as a viable candidate for CFG-neutral
+    // placement. No subsequent placement of this block will violate the CFG
+    // shape, so we get to use heuristics to choose a favorable placement.
+    for (MachineBasicBlock::succ_iterator SI = (*CBI)->succ_begin(),
+                                          SE = (*CBI)->succ_end();
+         SI != SE; ++SI) {
+      if (BlockFilter && !BlockFilter->count(*SI))
+        continue;
+      BlockChain &SuccChain = *BlockToChain[*SI];
+      // Disregard edges within a fixed chain, or edges to the loop header.
+      if (&Chain == &SuccChain || *SI == LoopHeaderBB)
+        continue;

-/// \brief Merge a chain with any viable successor.
-///
-/// This routine walks the predecessors of the current block, looking for
-/// viable merge candidates. It has strict rules it uses to determine when
-/// a predecessor can be merged with the current block, which center around
-/// preserving the CFG structure. It performs the merge if any viable candidate
-/// is found.
-void MachineBlockPlacement::mergeSuccessor(MachineBasicBlock *BB,
-                                           BlockChain *Chain,
-                                           BlockFilterSet *Filter) {
-  assert(BB);
-  assert(Chain);
-
-  // If this block is not at the end of its chain, it cannot merge with any
-  // other chain.
-  if (Chain && *llvm::prior(Chain->end()) != BB)
-    return;
-
-  // Walk through the successors looking for the highest probability edge.
-  MachineBasicBlock *Successor = 0;
-  BranchProbability BestProb = BranchProbability::getZero();
-  DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
-  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
-                                        SE = BB->succ_end();
-       SI != SE; ++SI) {
-    if (BB == *SI || (Filter && !Filter->count(*SI)))
-      continue;
-
-    BranchProbability SuccProb = MBPI->getEdgeProbability(BB, *SI);
-    DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb << "\n");
-    if (!Successor || SuccProb > BestProb || (!(SuccProb < BestProb) &&
-                                              BB->isLayoutSuccessor(*SI))) {
-      Successor = *SI;
-      BestProb = SuccProb;
+      // This is a cross-chain edge that is within the loop, so decrement the
+      // loop predecessor count of the destination chain.
+      if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0)
+        BlockWorkList.push_back(*SI);
    }
  }
-  if (!Successor)
-    return;
+}

-  // Grab a chain if it exists already for this successor and make sure the
-  // successor is at the start of the chain as we can't merge mid-chain. Also,
-  // if the successor chain is the same as our chain, we're already merged.
-  BlockChain *SuccChain = BlockToChain[Successor];
-  if (SuccChain && (SuccChain == Chain || Successor != *SuccChain->begin()))
-    return;
+void MachineBlockPlacement::buildChain(
+    MachineBasicBlock *BB,
+    BlockChain &Chain,
+    SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
+    const BlockFilterSet *BlockFilter) {
+  const BranchProbability HotProb(4, 5); // 80%
+  assert(BB);
+  assert(BlockToChain[BB] == &Chain);
+  assert(*Chain.begin() == BB);
+  MachineBasicBlock *LoopHeaderBB = BB;
+  markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter);
+  BB = *llvm::prior(Chain.end());
+  for (;;) {
+    assert(BB);
+    assert(BlockToChain[BB] == &Chain);
+    assert(*llvm::prior(Chain.end()) == BB);

-  // We only merge chains across a CFG merge when the desired merge path is
-  // significantly hotter than the incoming edge. We define a hot edge more
-  // strictly than the BranchProbabilityInfo does, as the two predecessor
-  // blocks may have dramatically different incoming probabilities we need to
-  // account for. Therefor we use the "global" edge weight which is the
-  // branch's probability times the block frequency of the predecessor.
-  BlockFrequency MergeWeight = MBFI->getBlockFreq(BB);
-  MergeWeight *= MBPI->getEdgeProbability(BB, Successor);
-  // We only want to consider breaking the CFG when the merge weight is much
-  // higher (80% vs. 20%), so multiply it by 1/4. This will require the merged
-  // edge to be 4x more likely before we disrupt the CFG. This number matches
-  // the definition of "hot" in BranchProbabilityAnalysis (80% vs. 20%).
-  MergeWeight *= BranchProbability(1, 4);
-  for (MachineBasicBlock::pred_iterator PI = Successor->pred_begin(),
-                                        PE = Successor->pred_end();
-       PI != PE; ++PI) {
-    if (BB == *PI || Successor == *PI) continue;
-    BlockFrequency PredWeight = MBFI->getBlockFreq(*PI);
-    PredWeight *= MBPI->getEdgeProbability(*PI, Successor);
+    // Look for the best viable successor if there is one to place immediately
+    // after this block.
+    MachineBasicBlock *BestSucc = 0;
+    BranchProbability BestProb = BranchProbability::getZero();
+    DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
+    for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+                                          SE = BB->succ_end();
+         SI != SE; ++SI) {
+      if (BlockFilter && !BlockFilter->count(*SI))
+        continue;
+      BlockChain &SuccChain = *BlockToChain[*SI];
+      if (&SuccChain == &Chain) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Already merged!\n");
+        continue;
+      }

-    // Return on the first predecessor we find which outstrips our merge weight.
-    if (MergeWeight < PredWeight)
+      BranchProbability SuccProb = MBPI->getEdgeProbability(BB, *SI);
+
+      // Only consider successors which are either "hot", or wouldn't violate
+      // any CFG constraints.
+      if (SuccChain.LoopPredecessors != 0 && SuccProb < HotProb) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> CFG conflict\n");
+        continue;
+      }
+
+      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+                   << " (prob)"
+                   << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
+                   << "\n");
+      if (BestSucc && BestProb >= SuccProb)
+        continue;
+      BestSucc = *SI;
+      BestProb = SuccProb;
+    }
+
+    // If an immediate successor isn't available, look for the best viable
+    // block among those we've identified as not violating the loop's CFG at
+    // this point. This won't be a fallthrough, but it will increase locality.
+    if (!BestSucc) {
+      BlockFrequency BestFreq;
+      for (SmallVectorImpl<MachineBasicBlock *>::iterator WBI = BlockWorkList.begin(),
+                                                          WBE = BlockWorkList.end();
+           WBI != WBE; ++WBI) {
+        if (BlockFilter && !BlockFilter->count(*WBI))
+          continue;
+        BlockChain &SuccChain = *BlockToChain[*WBI];
+        if (&SuccChain == &Chain) {
+          DEBUG(dbgs() << "    " << getBlockName(*WBI)
+                       << " -> Already merged!\n");
+          continue;
+        }
+        assert(SuccChain.LoopPredecessors == 0 && "Found CFG-violating block");
+
+        BlockFrequency CandidateFreq = MBFI->getBlockFreq(*WBI);
+        DEBUG(dbgs() << "    " << getBlockName(*WBI) << " -> " << CandidateFreq
+                     << " (freq)\n");
+        if (BestSucc && BestFreq >= CandidateFreq)
+          continue;
+        BestSucc = *WBI;
+        BestFreq = CandidateFreq;
+      }
+    }
+    if (!BestSucc) {
+      DEBUG(dbgs() << "Finished forming chain for header block "
+                   << getBlockNum(*Chain.begin()) << "\n");
      return;
-    DEBUG(dbgs() << "Breaking CFG edge!\n"
-                 << "  Edge from " << getBlockNum(BB) << " to "
-                 << getBlockNum(Successor) << ": " << MergeWeight << "\n"
-                 << "        vs. " << getBlockNum(BB) << " to "
-                 << getBlockNum(*PI) << ": " << PredWeight << "\n");
-  }
+    }

-  DEBUG(dbgs() << "Merging from " << getBlockNum(BB) << " to "
-               << getBlockNum(Successor) << "\n");
-  Chain->merge(Successor, SuccChain);
+    // Place this block, updating the datastructures to reflect its placement.
+    BlockChain &SuccChain = *BlockToChain[BestSucc];
+    DEBUG(dbgs() << "Merging from " << getBlockNum(BB)
+                 << " to " << getBlockNum(BestSucc) << "\n");
+    markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, BlockFilter);
+    Chain.merge(BestSucc, &SuccChain);
+    BB = *llvm::prior(Chain.end());
+  }
 }

 /// \brief Forms basic block chains from the natural loop structures.
@ -362,86 +404,162 @@ void MachineBlockPlacement::mergeSuccessor(MachineBasicBlock *BB,
 /// as much as possible. We can then stitch the chains together in a way which
 /// both preserves the topological structure and minimizes taken conditional
 /// branches.
-void MachineBlockPlacement::buildLoopChains(MachineFunction &F, MachineLoop &L) {
+void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
+                                            MachineLoop &L) {
  // First recurse through any nested loops, building chains for those inner
  // loops.
  for (MachineLoop::iterator LI = L.begin(), LE = L.end(); LI != LE; ++LI)
    buildLoopChains(F, **LI);

-  SmallPtrSet<MachineBasicBlock *, 16> LoopBlockSet(L.block_begin(),
-                                                    L.block_end());
+  SmallVector<MachineBasicBlock *, 16> BlockWorkList;
+  BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());

-  // Begin building up a set of chains of blocks within this loop which should
-  // remain contiguous. Some of the blocks already belong to a chain which
-  // represents an inner loop.
-  for (MachineLoop::block_iterator BI = L.block_begin(), BE = L.block_end();
+  // FIXME: This is a really lame way of walking the chains in the loop: we
+  // walk the blocks, and use a set to prevent visiting a particular chain
+  // twice.
+  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+  for (MachineLoop::block_iterator BI = L.block_begin(),
+                                   BE = L.block_end();
       BI != BE; ++BI) {
-    MachineBasicBlock *BB = *BI;
-    BlockChain *Chain = BlockToChain[BB];
-    if (!Chain) Chain = CreateChain(BB);
-    mergeSuccessor(BB, Chain, &LoopBlockSet);
+    BlockChain &Chain = *BlockToChain[*BI];
+    if (!UpdatedPreds.insert(&Chain) || BI == L.block_begin())
+      continue;
+
+    assert(Chain.LoopPredecessors == 0);
+    for (BlockChain::iterator BCI = Chain.begin(), BCE = Chain.end();
+         BCI != BCE; ++BCI) {
+      assert(BlockToChain[*BCI] == &Chain);
+      for (MachineBasicBlock::pred_iterator PI = (*BCI)->pred_begin(),
+                                            PE = (*BCI)->pred_end();
+           PI != PE; ++PI) {
+        if (BlockToChain[*PI] == &Chain || !LoopBlockSet.count(*PI))
+          continue;
+        ++Chain.LoopPredecessors;
+      }
+    }
+
+    if (Chain.LoopPredecessors == 0)
+      BlockWorkList.push_back(*BI);
  }
+
+  BlockChain &LoopChain = *BlockToChain[L.getHeader()];
+  buildChain(*L.block_begin(), LoopChain, BlockWorkList, &LoopBlockSet);
+
+  DEBUG({
+    if (LoopChain.LoopPredecessors)
+      dbgs() << "Loop chain contains a block without its preds placed!\n"
+             << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
+             << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
+    for (BlockChain::iterator BCI = LoopChain.begin(), BCE = LoopChain.end();
+         BCI != BCE; ++BCI)
+      if (!LoopBlockSet.erase(*BCI))
+        dbgs() << "Loop chain contains a block not contained by the loop!\n"
+               << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
+               << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+               << "  Bad block:    " << getBlockName(*BCI) << "\n";
+
+    if (!LoopBlockSet.empty())
+      for (SmallPtrSet<MachineBasicBlock *, 16>::iterator LBI = LoopBlockSet.begin(), LBE = LoopBlockSet.end();
+           LBI != LBE; ++LBI)
+        dbgs() << "Loop contains blocks never placed into a chain!\n"
+               << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
+               << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+               << "  Bad block:    " << getBlockName(*LBI) << "\n";
+  });
 }

 void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
-  // First build any loop-based chains.
+  // Ensure that every BB in the function has an associated chain to simplify
+  // the assumptions of the remaining algorithm.
+  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+    BlockToChain[&*FI] =
+      new (ChainAllocator.Allocate()) BlockChain(BlockToChain, &*FI);
+
+  // Build any loop-based chains.
  for (MachineLoopInfo::iterator LI = MLI->begin(), LE = MLI->end(); LI != LE;
       ++LI)
    buildLoopChains(F, **LI);

-  // Now walk the blocks of the function forming chains where they don't
-  // violate any CFG structure.
-  for (MachineFunction::iterator BI = F.begin(), BE = F.end();
-       BI != BE; ++BI) {
-    MachineBasicBlock *BB = BI;
-    BlockChain *Chain = BlockToChain[BB];
-    if (!Chain) Chain = CreateChain(BB);
-    mergeSuccessor(BB, Chain);
-  }
-}
+  SmallVector<MachineBasicBlock *, 16> BlockWorkList;

-void MachineBlockPlacement::placeChainsTopologically(MachineFunction &F) {
-  MachineBasicBlock *EntryB = &F.front();
-  assert(BlockToChain[EntryB] && "Missing chain for entry block");
-  assert(*BlockToChain[EntryB]->begin() == EntryB &&
-         "Entry block is not the head of the entry block chain");
-
-  // Walk the blocks in RPO, and insert each block for a chain in order the
-  // first time we see that chain.
-  MachineFunction::iterator InsertPos = F.begin();
-  SmallPtrSet<BlockChain *, 16> VisitedChains;
-  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(EntryB);
-  typedef ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator
-    rpo_iterator;
-  for (rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    BlockChain *Chain = BlockToChain[*I];
-    assert(Chain);
-    if(!VisitedChains.insert(Chain))
-      continue;
-    for (BlockChain::iterator BI = Chain->begin(), BE = Chain->end(); BI != BE;
-         ++BI) {
-      DEBUG(dbgs() << (BI == Chain->begin() ? "Placing chain "
-                                            : "          ... ")
-                   << getBlockName(*BI) << "\n");
-      if (InsertPos != MachineFunction::iterator(*BI))
-        F.splice(InsertPos, *BI);
-      else
-        ++InsertPos;
-    }
-  }
-
-  // Now that every block is in its final position, update all of the
-  // terminators.
-  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    MachineBasicBlock *BB = &*FI;
+    BlockChain &Chain = *BlockToChain[BB];
+    if (!UpdatedPreds.insert(&Chain))
+      continue;
+
+    assert(Chain.LoopPredecessors == 0);
+    for (BlockChain::iterator BCI = Chain.begin(), BCE = Chain.end();
+         BCI != BCE; ++BCI) {
+      assert(BlockToChain[*BCI] == &Chain);
+      for (MachineBasicBlock::pred_iterator PI = (*BCI)->pred_begin(),
+                                            PE = (*BCI)->pred_end();
+           PI != PE; ++PI) {
+        if (BlockToChain[*PI] == &Chain)
+          continue;
+        ++Chain.LoopPredecessors;
+      }
+    }
+
+    if (Chain.LoopPredecessors == 0)
+      BlockWorkList.push_back(BB);
+  }
+
+  BlockChain &FunctionChain = *BlockToChain[&F.front()];
+  buildChain(&F.front(), FunctionChain, BlockWorkList);
+
+  typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType;
+  DEBUG({
+    FunctionBlockSetType FunctionBlockSet;
+    for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+      FunctionBlockSet.insert(FI);
+
+    for (BlockChain::iterator BCI = FunctionChain.begin(), BCE = FunctionChain.end();
+         BCI != BCE; ++BCI)
+      if (!FunctionBlockSet.erase(*BCI))
+        dbgs() << "Function chain contains a block not in the function!\n"
+               << "  Bad block:    " << getBlockName(*BCI) << "\n";
+
+    if (!FunctionBlockSet.empty())
+      for (SmallPtrSet<MachineBasicBlock *, 16>::iterator FBI = FunctionBlockSet.begin(),
+           FBE = FunctionBlockSet.end(); FBI != FBE; ++FBI)
+        dbgs() << "Function contains blocks never placed into a chain!\n"
+               << "  Bad block:    " << getBlockName(*FBI) << "\n";
+  });
+
+  // Splice the blocks into place.
+  MachineFunction::iterator InsertPos = F.begin();
+  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+  for (BlockChain::iterator BI = FunctionChain.begin(), BE = FunctionChain.end();
+       BI != BE; ++BI) {
+    DEBUG(dbgs() << (BI == FunctionChain.begin() ? "Placing chain "
+                                                  : "          ... ")
+          << getBlockName(*BI) << "\n");
+    if (InsertPos != MachineFunction::iterator(*BI))
+      F.splice(InsertPos, *BI);
+    else
+      ++InsertPos;
+
+    // Update the terminator of the previous block.
+    if (BI == FunctionChain.begin())
+      continue;
+    MachineBasicBlock *PrevBB = llvm::prior(MachineFunction::iterator(*BI));
+
    // FIXME: It would be awesome of updateTerminator would just return rather
    // than assert when the branch cannot be analyzed in order to remove this
    // boiler plate.
    Cond.clear();
    MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
-    if (!TII->AnalyzeBranch(*FI, TBB, FBB, Cond))
-      FI->updateTerminator();
+    if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond))
+      PrevBB->updateTerminator();
  }
+
+  // Fixup the last block.
+  Cond.clear();
+  MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
+  if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond))
+    F.back().updateTerminator();
 }

 /// \brief Recursive helper to align a loop and any nested loops.
@ -479,7 +597,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
  assert(BlockToChain.empty());

  buildCFGChains(F);
-  placeChainsTopologically(F);
  AlignLoops(F);

  BlockToChain.clear();
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@ -72,8 +72,103 @@ exit:
  ret i32 %b
 }

+define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
+; Check that we sink cold loop blocks after the hot loop body.
+; CHECK: test_loop_cold_blocks:
+; CHECK: %entry
+; CHECK: %body1
+; CHECK: %body2
+; CHECK: %body3
+; CHECK: %unlikely1
+; CHECK: %unlikely2
+; CHECK: %exit
+
+entry:
+  br label %body1
+
+body1:
+  %iv = phi i32 [ 0, %entry ], [ %next, %body3 ]
+  %base = phi i32 [ 0, %entry ], [ %sum, %body3 ]
+  %unlikelycond1 = icmp slt i32 %base, 42
+  br i1 %unlikelycond1, label %unlikely1, label %body2, !prof !0
+
+unlikely1:
+  call void @error(i32 %i, i32 1, i32 %base)
+  br label %body2
+
+body2:
+  %unlikelycond2 = icmp sgt i32 %base, 21
+  br i1 %unlikelycond2, label %unlikely2, label %body3, !prof !0
+
+unlikely2:
+  call void @error(i32 %i, i32 2, i32 %base)
+  br label %body3
+
+body3:
+  %arrayidx = getelementptr inbounds i32* %a, i32 %iv
+  %0 = load i32* %arrayidx
+  %sum = add nsw i32 %0, %base
+  %next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %next, %i
+  br i1 %exitcond, label %exit, label %body1
+
+exit:
+  ret i32 %sum
+}
+
 !0 = metadata !{metadata !"branch_weights", i32 4, i32 64}

+define i32 @test_loop_early_exits(i32 %i, i32* %a) {
+; Check that we sink early exit blocks out of loop bodies.
+; CHECK: test_loop_early_exits:
+; CHECK: %entry
+; CHECK: %body1
+; CHECK: %body2
+; CHECK: %body3
+; CHECK: %body4
+; CHECK: %exit
+; CHECK: %bail1
+; CHECK: %bail2
+; CHECK: %bail3
+
+entry:
+  br label %body1
+
+body1:
+  %iv = phi i32 [ 0, %entry ], [ %next, %body4 ]
+  %base = phi i32 [ 0, %entry ], [ %sum, %body4 ]
+  %bailcond1 = icmp eq i32 %base, 42
+  br i1 %bailcond1, label %bail1, label %body2
+
+bail1:
+  ret i32 -1
+
+body2:
+  %bailcond2 = icmp eq i32 %base, 43
+  br i1 %bailcond2, label %bail2, label %body3
+
+bail2:
+  ret i32 -2
+
+body3:
+  %bailcond3 = icmp eq i32 %base, 44
+  br i1 %bailcond3, label %bail3, label %body4
+
+bail3:
+  ret i32 -3
+
+body4:
+  %arrayidx = getelementptr inbounds i32* %a, i32 %iv
+  %0 = load i32* %arrayidx
+  %sum = add nsw i32 %0, %base
+  %next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %next, %i
+  br i1 %exitcond, label %exit, label %body1
+
+exit:
+  ret i32 %sum
+}
+
 define i32 @test_loop_align(i32 %i, i32* %a) {
 ; Check that we provide basic loop body alignment with the block placement
 ; pass.
@ -105,7 +200,7 @@ define i32 @test_nested_loop_align(i32 %i, i32* %a, i32* %b) {
 ; CHECK: test_nested_loop_align:
 ; CHECK: %entry
 ; CHECK: .align [[ALIGN]],
-; CHECK-NEXT: %loop.body.2
+; CHECK-NEXT: %loop.body.1
 ; CHECK: .align [[ALIGN]],
 ; CHECK-NEXT: %inner.loop.body
 ; CHECK-NOT: .align