forked from OSchip/llvm-project
In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available.
In the current BB placement algorithm, a loop chain always contains all loop blocks. This has a drawback that cold blocks in the loop may be inserted on a hot function path, hence increasing branch cost and also reducing icache locality. Consider a simple example shown below: A | B⇆C | D When B->C is quite cold, the best BB-layout should be A,B,D,C. But the current implementation produces A,C,B,D. This patch filters those cold blocks off from the loop chain by comparing the ratio: LoopBBFreq / LoopFreq to 20%: if it is less than 20%, we don't include this BB to the loop chain. Here LoopFreq is the frequency of the loop when we reduce the loop into a single node. In general we have more cold blocks when the loop has few iterations. And vice versa. Differential revision: http://reviews.llvm.org/D11662 llvm-svn: 251833
This commit is contained in:
parent
935d79b0b1
commit
b90b9e0531
|
@ -81,6 +81,12 @@ static cl::opt<unsigned> OutlineOptionalThreshold(
|
|||
"instruction count below this threshold"),
|
||||
cl::init(4), cl::Hidden);
|
||||
|
||||
static cl::opt<unsigned> LoopToColdBlockRatio(
|
||||
"loop-to-cold-block-ratio",
|
||||
cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
|
||||
"(frequency of block) is greater than this ratio"),
|
||||
cl::init(5), cl::Hidden);
|
||||
|
||||
static cl::opt<bool>
|
||||
PreciseRotationCost("precise-rotation-cost",
|
||||
cl::desc("Model the cost of loop rotation more "
|
||||
|
@ -263,6 +269,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
|
|||
const BlockFilterSet &LoopBlockSet);
|
||||
MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L,
|
||||
const BlockFilterSet &LoopBlockSet);
|
||||
BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L);
|
||||
void buildLoopChains(MachineFunction &F, MachineLoop &L);
|
||||
void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
|
||||
const BlockFilterSet &LoopBlockSet);
|
||||
|
@ -960,6 +967,42 @@ void MachineBlockPlacement::rotateLoopWithProfile(
|
|||
}
|
||||
}
|
||||
|
||||
/// \brief Collect blocks in the given loop that are to be placed.
|
||||
///
|
||||
/// When profile data is available, exclude cold blocks from the returned set;
|
||||
/// otherwise, collect all blocks in the loop.
|
||||
MachineBlockPlacement::BlockFilterSet
|
||||
MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) {
|
||||
BlockFilterSet LoopBlockSet;
|
||||
|
||||
// Filter cold blocks off from LoopBlockSet when profile data is available.
|
||||
// Collect the sum of frequencies of incoming edges to the loop header from
|
||||
// outside. If we treat the loop as a super block, this is the frequency of
|
||||
// the loop. Then for each block in the loop, we calculate the ratio between
|
||||
// its frequency and the frequency of the loop block. When it is too small,
|
||||
// don't add it to the loop chain. If there are outer loops, then this block
|
||||
// will be merged into the first outer loop chain for which this block is not
|
||||
// cold anymore. This needs precise profile data and we only do this when
|
||||
// profile data is available.
|
||||
if (F.getFunction()->getEntryCount()) {
|
||||
BlockFrequency LoopFreq(0);
|
||||
for (auto LoopPred : L.getHeader()->predecessors())
|
||||
if (!L.contains(LoopPred))
|
||||
LoopFreq += MBFI->getBlockFreq(LoopPred) *
|
||||
MBPI->getEdgeProbability(LoopPred, L.getHeader());
|
||||
|
||||
for (MachineBasicBlock *LoopBB : L.getBlocks()) {
|
||||
auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
|
||||
if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
|
||||
continue;
|
||||
LoopBlockSet.insert(LoopBB);
|
||||
}
|
||||
} else
|
||||
LoopBlockSet.insert(L.block_begin(), L.block_end());
|
||||
|
||||
return LoopBlockSet;
|
||||
}
|
||||
|
||||
/// \brief Forms basic block chains from the natural loop structures.
|
||||
///
|
||||
/// These chains are designed to preserve the existing *structure* of the code
|
||||
|
@ -974,7 +1017,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
|
|||
buildLoopChains(F, *InnerLoop);
|
||||
|
||||
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
|
||||
BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
|
||||
BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L);
|
||||
|
||||
// Check if we have profile data for this function. If yes, we will rotate
|
||||
// this loop by modeling costs more precisely which requires the profile data
|
||||
|
@ -1005,7 +1048,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
|
|||
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
|
||||
assert(LoopChain.LoopPredecessors == 0);
|
||||
UpdatedPreds.insert(&LoopChain);
|
||||
for (MachineBasicBlock *LoopBB : L.getBlocks()) {
|
||||
|
||||
for (MachineBasicBlock *LoopBB : LoopBlockSet) {
|
||||
BlockChain &Chain = *BlockToChain[LoopBB];
|
||||
if (!UpdatedPreds.insert(&Chain).second)
|
||||
continue;
|
||||
|
|
|
@ -0,0 +1,122 @@
|
|||
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
|
||||
|
||||
define void @foo() !prof !1 {
|
||||
; Test if a cold block in a loop will be placed at the end of the function
|
||||
; chain.
|
||||
;
|
||||
; CHECK-LABEL: foo:
|
||||
; CHECK: callq b
|
||||
; CHECK: callq c
|
||||
; CHECK: callq e
|
||||
; CHECK: callq f
|
||||
; CHECK: callq d
|
||||
|
||||
entry:
|
||||
br label %header
|
||||
|
||||
header:
|
||||
call void @b()
|
||||
%call = call zeroext i1 @a()
|
||||
br i1 %call, label %if.then, label %if.else, !prof !4
|
||||
|
||||
if.then:
|
||||
call void @c()
|
||||
br label %if.end
|
||||
|
||||
if.else:
|
||||
call void @d()
|
||||
br label %if.end
|
||||
|
||||
if.end:
|
||||
call void @e()
|
||||
%call2 = call zeroext i1 @a()
|
||||
br i1 %call2, label %header, label %end, !prof !5
|
||||
|
||||
end:
|
||||
call void @f()
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @nested_loop_0() !prof !1 {
|
||||
; Test if a block that is cold in the inner loop but not cold in the outer loop
|
||||
; will merged to the outer loop chain.
|
||||
;
|
||||
; CHECK-LABEL: nested_loop_0:
|
||||
; CHECK: callq c
|
||||
; CHECK: callq d
|
||||
; CHECK: callq e
|
||||
; CHECK: callq b
|
||||
; CHECK: callq f
|
||||
|
||||
entry:
|
||||
br label %header
|
||||
|
||||
header:
|
||||
call void @b()
|
||||
%call4 = call zeroext i1 @a()
|
||||
br i1 %call4, label %header2, label %end
|
||||
|
||||
header2:
|
||||
call void @c()
|
||||
%call = call zeroext i1 @a()
|
||||
br i1 %call, label %if.then, label %if.else, !prof !2
|
||||
|
||||
if.then:
|
||||
call void @d()
|
||||
%call3 = call zeroext i1 @a()
|
||||
br i1 %call3, label %header2, label %header, !prof !3
|
||||
|
||||
if.else:
|
||||
call void @e()
|
||||
%call2 = call zeroext i1 @a()
|
||||
br i1 %call2, label %header2, label %header, !prof !3
|
||||
|
||||
end:
|
||||
call void @f()
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @nested_loop_1() !prof !1 {
|
||||
; Test if a cold block in an inner loop will be placed at the end of the
|
||||
; function chain.
|
||||
;
|
||||
; CHECK-LABEL: nested_loop_1:
|
||||
; CHECK: callq b
|
||||
; CHECK: callq c
|
||||
; CHECK: callq e
|
||||
; CHECK: callq d
|
||||
|
||||
entry:
|
||||
br label %header
|
||||
|
||||
header:
|
||||
call void @b()
|
||||
br label %header2
|
||||
|
||||
header2:
|
||||
call void @c()
|
||||
%call = call zeroext i1 @a()
|
||||
br i1 %call, label %end, label %if.else, !prof !4
|
||||
|
||||
if.else:
|
||||
call void @d()
|
||||
%call2 = call zeroext i1 @a()
|
||||
br i1 %call2, label %header2, label %header, !prof !5
|
||||
|
||||
end:
|
||||
call void @e()
|
||||
ret void
|
||||
}
|
||||
|
||||
declare zeroext i1 @a()
|
||||
declare void @b()
|
||||
declare void @c()
|
||||
declare void @d()
|
||||
declare void @e()
|
||||
declare void @f()
|
||||
|
||||
!1 = !{!"function_entry_count", i64 1}
|
||||
!2 = !{!"branch_weights", i32 100, i32 1}
|
||||
!3 = !{!"branch_weights", i32 1, i32 10}
|
||||
!4 = !{!"branch_weights", i32 1000, i32 1}
|
||||
!5 = !{!"branch_weights", i32 100, i32 1}
|
Loading…
Reference in New Issue