Enhance loop rotation with existence of profile data in MachineBlockPlacement pass.

Currently, in MachineBlockPlacement pass the loop is rotated to let the best exit to be the last BB in the loop chain, to maximize the fall-through from the loop to outside. With profile data, we can determine the cost in terms of missed fall through opportunities when rotating a loop chain and select the best rotation. Basically, there are three kinds of cost to consider for each rotation:

1. The possibly missed fall through edge (if it exists) from BB out of the loop to the loop header.
2. The possibly missed fall through edges (if they exist) from the loop exits to BB out of the loop.
3. The missed fall through edge (if it exists) from the last BB to the first BB in the loop chain.

Therefore, the cost for a given rotation is the sum of costs listed above. We select the best rotation with the smallest cost. This is only for PGO mode when we have more precise edge frequencies.

Differential revision: http://reviews.llvm.org/D10717

llvm-svn: 250754
This commit is contained in:
Cong Hou 2015-10-19 23:16:40 +00:00
parent 6681041d70
commit 7745dbc5c4
3 changed files with 386 additions and 3 deletions

View File

@ -81,6 +81,23 @@ static cl::opt<unsigned> OutlineOptionalThreshold(
"instruction count below this threshold"),
cl::init(4), cl::Hidden);
static cl::opt<bool>
PreciseRotationCost("precise-rotation-cost",
cl::desc("Model the cost of loop rotation more "
"precisely by using profile data."),
cl::init(false), cl::Hidden);
static cl::opt<unsigned> MisfetchCost(
"misfetch-cost",
cl::desc("Cost that models the probablistic risk of an instruction "
"misfetch due to a jump comparing to falling through, whose cost "
"is zero."),
cl::init(1), cl::Hidden);
static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
cl::desc("Cost of jump instructions."),
cl::init(1), cl::Hidden);
namespace {
class BlockChain;
/// \brief Type for our function-wide basic block -> block chain mapping.
@ -249,6 +266,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
void buildLoopChains(MachineFunction &F, MachineLoop &L);
void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
const BlockFilterSet &LoopBlockSet);
void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
const BlockFilterSet &LoopBlockSet);
void buildCFGChains(MachineFunction &F);
public:
@ -791,6 +810,156 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
}
/// \brief Attempt to rotate a loop based on profile data to reduce branch cost.
///
/// With profile data, we can determine the cost in terms of missed fall through
/// opportunities when rotating a loop chain and select the best rotation.
/// Basically, there are three kinds of cost to consider for each rotation:
/// 1. The possibly missed fall through edge (if it exists) from BB out of
/// the loop to the loop header.
/// 2. The possibly missed fall through edges (if they exist) from the loop
/// exits to BB out of the loop.
/// 3. The missed fall through edge (if it exists) from the last BB to the
/// first BB in the loop chain.
/// Therefore, the cost for a given rotation is the sum of costs listed above.
/// We select the best rotation with the smallest cost.
void MachineBlockPlacement::rotateLoopWithProfile(
BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) {
auto HeaderBB = L.getHeader();
auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB);
auto RotationPos = LoopChain.end();
BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
// A utility lambda that scales up a block frequency by dividing it by a
// branch probability which is the reciprocal of the scale.
auto ScaleBlockFrequency = [](BlockFrequency Freq,
unsigned Scale) -> BlockFrequency {
if (Scale == 0)
return 0;
// Use operator / between BlockFrequency and BranchProbability to implement
// saturating multiplication.
return Freq / BranchProbability(1, Scale);
};
// Compute the cost of the missed fall-through edge to the loop header if the
// chain head is not the loop header. As we only consider natural loops with
// single header, this computation can be done only once.
BlockFrequency HeaderFallThroughCost(0);
for (auto *Pred : HeaderBB->predecessors()) {
BlockChain *PredChain = BlockToChain[Pred];
if (!LoopBlockSet.count(Pred) &&
(!PredChain || Pred == *std::prev(PredChain->end()))) {
auto EdgeFreq =
MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
// If the predecessor has only an unconditional jump to the header, we
// need to consider the cost of this jump.
if (Pred->succ_size() == 1)
FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
}
}
// Here we collect all exit blocks in the loop, and for each exit we find out
// its hottest exit edge. For each loop rotation, we define the loop exit cost
// as the sum of frequencies of exit edges we collect here, excluding the exit
// edge from the tail of the loop chain.
SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
for (auto BB : LoopChain) {
uint32_t LargestExitEdgeWeight = 0;
for (auto *Succ : BB->successors()) {
BlockChain *SuccChain = BlockToChain[Succ];
if (!LoopBlockSet.count(Succ) &&
(!SuccChain || Succ == *SuccChain->begin())) {
uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight);
}
}
if (LargestExitEdgeWeight > 0) {
uint32_t WeightScale = 0;
uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
auto ExitFreq =
MBFI->getBlockFreq(BB) *
BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight);
ExitsWithFreq.emplace_back(BB, ExitFreq);
}
}
// In this loop we iterate every block in the loop chain and calculate the
// cost assuming the block is the head of the loop chain. When the loop ends,
// we should have found the best candidate as the loop chain's head.
for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
EndIter = LoopChain.end();
Iter != EndIter; Iter++, TailIter++) {
// TailIter is used to track the tail of the loop chain if the block we are
// checking (pointed by Iter) is the head of the chain.
if (TailIter == LoopChain.end())
TailIter = LoopChain.begin();
auto TailBB = *TailIter;
// Calculate the cost by putting this BB to the top.
BlockFrequency Cost = 0;
// If the current BB is the loop header, we need to take into account the
// cost of the missed fall through edge from outside of the loop to the
// header.
if (Iter != HeaderIter)
Cost += HeaderFallThroughCost;
// Collect the loop exit cost by summing up frequencies of all exit edges
// except the one from the chain tail.
for (auto &ExitWithFreq : ExitsWithFreq)
if (TailBB != ExitWithFreq.first)
Cost += ExitWithFreq.second;
// The cost of breaking the once fall-through edge from the tail to the top
// of the loop chain. Here we need to consider three cases:
// 1. If the tail node has only one successor, then we will get an
// additional jmp instruction. So the cost here is (MisfetchCost +
// JumpInstCost) * tail node frequency.
// 2. If the tail node has two successors, then we may still get an
// additional jmp instruction if the layout successor after the loop
// chain is not its CFG successor. Note that the more frequently executed
// jmp instruction will be put ahead of the other one. Assume the
// frequency of those two branches are x and y, where x is the frequency
// of the edge to the chain head, then the cost will be
// (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
// 3. If the tail node has more than two successors (this rarely happens),
// we won't consider any additional cost.
if (TailBB->isSuccessor(*Iter)) {
auto TailBBFreq = MBFI->getBlockFreq(TailBB);
if (TailBB->succ_size() == 1)
Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
MisfetchCost + JumpInstCost);
else if (TailBB->succ_size() == 2) {
auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
? TailBBFreq * TailToHeadProb.getCompl()
: TailToHeadFreq;
Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
}
}
DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter)
<< " to the top: " << Cost.getFrequency() << "\n");
if (Cost < SmallestRotationCost) {
SmallestRotationCost = Cost;
RotationPos = Iter;
}
}
if (RotationPos != LoopChain.end()) {
DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos)
<< " to the top\n");
std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
}
}
/// \brief Forms basic block chains from the natural loop structures.
///
/// These chains are designed to preserve the existing *structure* of the code
@ -807,17 +976,25 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
// Check if we have profile data for this function. If yes, we will rotate
// this loop by modeling costs more precisely which requires the profile data
// for better layout.
bool RotateLoopWithProfile =
PreciseRotationCost && F.getFunction()->getEntryCount();
// First check to see if there is an obviously preferable top block for the
// loop. This will default to the header, but may end up as one of the
// predecessors to the header if there is one which will result in strictly
// fewer branches in the loop body.
MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
// When we use profile data to rotate the loop, this is unnecessary.
MachineBasicBlock *LoopTop =
RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
// If we selected just the header for the loop top, look for a potentially
// profitable exit block in the event that rotating the loop can eliminate
// branches by placing an exit edge at the bottom.
MachineBasicBlock *ExitingBB = nullptr;
if (LoopTop == L.getHeader())
if (!RotateLoopWithProfile && LoopTop == L.getHeader())
ExitingBB = findBestLoopExit(F, L, LoopBlockSet);
BlockChain &LoopChain = *BlockToChain[LoopTop];
@ -848,6 +1025,10 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
}
buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet);
if (RotateLoopWithProfile)
rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
else
rotateLoop(LoopChain, ExitingBB, LoopBlockSet);
DEBUG({

View File

@ -0,0 +1,80 @@
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE
define void @foo() {
; Test that not all edges in the loop chain are fall through without profile
; data.
;
; CHECK-LABEL: foo:
; CHECK: callq e
; CHECK: callq f
; CHECK: callq g
; CHECK: callq h
entry:
br label %header
header:
call void @e()
%call = call zeroext i1 @a()
br i1 %call, label %if.then, label %if.else, !prof !2
if.then:
call void @f()
br label %if.end
if.else:
call void @g()
br label %if.end
if.end:
call void @h()
%call2 = call zeroext i1 @a()
br i1 %call2, label %header, label %end
end:
ret void
}
define void @bar() !prof !1 {
; Test that all edges in the loop chain are fall through with profile data.
;
; CHECK-PROFILE-LABEL: bar:
; CHECK-PROFILE: callq g
; CHECK-PROFILE: callq h
; CHECK-PROFILE: callq e
; CHECK-PROFILE: callq f
entry:
br label %header
header:
call void @e()
%call = call zeroext i1 @a()
br i1 %call, label %if.then, label %if.else, !prof !2
if.then:
call void @f()
br label %if.end
if.else:
call void @g()
br label %if.end
if.end:
call void @h()
%call2 = call zeroext i1 @a()
br i1 %call2, label %header, label %end
end:
ret void
}
declare zeroext i1 @a()
declare void @e()
declare void @f()
declare void @g()
declare void @h()
!1 = !{!"function_entry_count", i64 1}
!2 = !{!"branch_weights", i32 16, i32 16}

View File

@ -0,0 +1,122 @@
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE
define void @foo() {
; Test a nested loop case when profile data is not available.
;
; CHECK-LABEL: foo:
; CHECK: callq b
; CHECK: callq c
; CHECK: callq d
; CHECK: callq e
; CHECK: callq f
; CHECK: callq g
; CHECK: callq h
entry:
br label %header
header:
call void @b()
%call = call zeroext i1 @a()
br i1 %call, label %if.then, label %if.else, !prof !2
if.then:
br label %header2
header2:
call void @c()
%call1 = call zeroext i1 @a()
br i1 %call1, label %if.then2, label %if.else2, !prof !2
if.then2:
call void @d()
br label %if.end2
if.else2:
call void @e()
br label %if.end2
if.end2:
call void @f()
%call2 = call zeroext i1 @a()
br i1 %call2, label %header2, label %if.end
if.else:
call void @g()
br label %if.end
if.end:
call void @h()
%call3 = call zeroext i1 @a()
br i1 %call3, label %header, label %end
end:
ret void
}
define void @bar() !prof !1 {
; Test a nested loop case when profile data is available.
;
; CHECK-PROFILE-LABEL: bar:
; CHECK-PROFILE: callq e
; CHECK-PROFILE: callq f
; CHECK-PROFILE: callq c
; CHECK-PROFILE: callq d
; CHECK-PROFILE: callq h
; CHECK-PROFILE: callq b
; CHECK-PROFILE: callq g
entry:
br label %header
header:
call void @b()
%call = call zeroext i1 @a()
br i1 %call, label %if.then, label %if.else, !prof !2
if.then:
br label %header2
header2:
call void @c()
%call1 = call zeroext i1 @a()
br i1 %call1, label %if.then2, label %if.else2, !prof !2
if.then2:
call void @d()
br label %if.end2
if.else2:
call void @e()
br label %if.end2
if.end2:
call void @f()
%call2 = call zeroext i1 @a()
br i1 %call2, label %header2, label %if.end
if.else:
call void @g()
br label %if.end
if.end:
call void @h()
%call3 = call zeroext i1 @a()
br i1 %call3, label %header, label %end
end:
ret void
}
declare zeroext i1 @a()
declare void @b()
declare void @c()
declare void @d()
declare void @e()
declare void @f()
declare void @g()
declare void @h()
!1 = !{!"function_entry_count", i64 1}
!2 = !{!"branch_weights", i32 16, i32 16}