[PGO] Enhance pgo counter promotion

This is an incremental change to the promotion feature.

There are two problems with the current behavior:
1) loops with multiple exiting blocks are totally disabled
2) a counter update can only be promoted one level up in
  the loop nest -- which does not help much for short
  trip-count inner loops inside high trip-count outer loops.

Due to this limitation, we still saw very large profile
count fluctuations from run to run for the affected loops
which are usually very hot.

This patch adds support for promoting counters iteratively
across the loop nest. It also turns on promotion for
loops with multiple exiting blocks (with a limit).

For single-threaded applications, the performance impact is flat
on average. For instance, dealII improves, but povray regresses.

llvm-svn: 307863
This commit is contained in:
Xinliang David Li 2017-07-12 23:27:44 +00:00
parent aa356c3cd5
commit f564c6959e
4 changed files with 284 additions and 46 deletions

View File

@ -112,7 +112,7 @@ cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
cl::desc("Do counter register promotion"),
cl::init(false));
cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(10),
cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
cl::desc("Max number counter promotions per loop to avoid"
" increasing register pressure too much"));
@ -121,10 +121,21 @@ cl::opt<int>
MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
cl::desc("Max number of allowed counter promotions"));
cl::opt<bool> SpeculativeCounterPromotion(
cl::ZeroOrMore, "speculative-counter-promotion", cl::init(false),
cl::desc("Allow counter promotion for loops with multiple exiting blocks "
" or top-tested loops. "));
// Command-line option "speculative-counter-promotion-max-exiting"
// (default 3): the maximum number of exiting blocks a loop may have for
// speculative counter promotion to be allowed.
cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
cl::desc("The max number of exiting blocks of a loop to allow "
" speculative counter promotion"));
// Command-line option "speculative-counter-promotion-to-loop"
// (default false): when false, promotion into a target block that is itself
// inside a loop is only allowed if the promoted update can be promoted
// further, out of that loop as well.
cl::opt<bool> SpeculativeCounterPromotionToLoop(
cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
cl::desc("When the option is false, if the target block is in a loop, "
"the promotion will be disallowed unless the promoted counter "
" update can be further/iteratively promoted into an acyclic "
" region."));
// Command-line option "iterative-counter-promotion" (default true): allows
// a promoted counter update to be promoted again across the enclosing loops
// of the loop nest.
cl::opt<bool> IterativeCounterPromotion(
cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
cl::desc("Allow counter promotion across the whole loop nest."));
class InstrProfilingLegacyPass : public ModulePass {
InstrProfiling InstrProf;
@ -150,6 +161,7 @@ public:
}
};
///
/// A helper class to promote one counter RMW operation in the loop
/// into register update.
///
@ -158,16 +170,19 @@ public:
///
class PGOCounterPromoterHelper : public LoadAndStorePromoter {
public:
PGOCounterPromoterHelper(Instruction *L, Instruction *S, SSAUpdater &SSA,
Value *Init, BasicBlock *PH,
ArrayRef<BasicBlock *> ExitBlocks,
ArrayRef<Instruction *> InsertPts)
PGOCounterPromoterHelper(
Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
ArrayRef<Instruction *> InsertPts,
DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
LoopInfo &LI)
: LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
InsertPts(InsertPts) {
InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
assert(isa<LoadInst>(L));
assert(isa<StoreInst>(S));
SSA.AddAvailableValue(PH, Init);
}
void doExtraRewritesBeforeFinalDeletion() const override {
for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
BasicBlock *ExitBlock = ExitBlocks[i];
@ -179,12 +194,21 @@ public:
Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
IRBuilder<> Builder(InsertPos);
if (AtomicCounterUpdatePromoted)
// An atomic update can currently only be promoted across the current
// loop, not the whole loop nest.
Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
AtomicOrdering::SequentiallyConsistent);
else {
LoadInst *OldVal = Builder.CreateLoad(Addr, "pgocount.promoted");
auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
Builder.CreateStore(NewVal, Addr);
auto *NewStore = Builder.CreateStore(NewVal, Addr);
// Now update the parent loop's candidate list:
if (IterativeCounterPromotion) {
auto *TargetLoop = LI.getLoopFor(ExitBlock);
if (TargetLoop)
LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
}
}
}
}
@ -193,6 +217,8 @@ private:
Instruction *Store;
ArrayRef<BasicBlock *> ExitBlocks;
ArrayRef<Instruction *> InsertPts;
DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
LoopInfo &LI;
};
/// A helper class to do register promotion for all profile counter
@ -200,12 +226,15 @@ private:
///
class PGOCounterPromoter {
public:
PGOCounterPromoter(ArrayRef<LoadStorePair> Cands, Loop &Loop)
: Candidates(Cands), ExitBlocks(), InsertPts(), ParentLoop(Loop) {
PGOCounterPromoter(
DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
Loop &CurLoop, LoopInfo &LI)
: LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
LI(LI) {
SmallVector<BasicBlock *, 8> LoopExitBlocks;
SmallPtrSet<BasicBlock *, 8> BlockSet;
ParentLoop.getExitBlocks(LoopExitBlocks);
L.getExitBlocks(LoopExitBlocks);
for (BasicBlock *ExitBlock : LoopExitBlocks) {
if (BlockSet.insert(ExitBlock).second) {
@ -216,55 +245,97 @@ public:
}
bool run(int64_t *NumPromoted) {
// We can't insert into a catchswitch.
bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
return isa<CatchSwitchInst>(Exit->getTerminator());
});
if (HasCatchSwitch)
return false;
if (!ParentLoop.hasDedicatedExits())
return false;
BasicBlock *PH = ParentLoop.getLoopPreheader();
if (!PH)
return false;
BasicBlock *H = ParentLoop.getHeader();
bool TopTested =
((ParentLoop.getBlocks().size() > 1) && ParentLoop.isLoopExiting(H));
if (!SpeculativeCounterPromotion &&
(TopTested || ParentLoop.getExitingBlock() == nullptr))
unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
if (MaxProm == 0)
return false;
unsigned Promoted = 0;
for (auto &Cand : Candidates) {
for (auto &Cand : LoopToCandidates[&L]) {
SmallVector<PHINode *, 4> NewPHIs;
SSAUpdater SSA(&NewPHIs);
Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
PH, ExitBlocks, InsertPts);
L.getLoopPreheader(), ExitBlocks,
InsertPts, LoopToCandidates, LI);
Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
Promoted++;
if (Promoted >= MaxNumOfPromotionsPerLoop)
if (Promoted >= MaxProm)
break;
(*NumPromoted)++;
if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
break;
}
DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
<< ParentLoop.getLoopDepth() << ")\n");
<< L.getLoopDepth() << ")\n");
return Promoted != 0;
}
private:
ArrayRef<LoadStorePair> Candidates;
/// Returns true if counter promotion is allowed for loop \p LP given the
/// number of its exiting blocks.
///
/// A loop with exactly one exiting block is always allowed (promotion is
/// not speculative in that case). Otherwise the loop is allowed only when
/// its exiting-block count does not exceed
/// SpeculativeCounterPromotionMaxExiting.
bool allowSpeculativeCounterPromotion(Loop *LP) {
  SmallVector<BasicBlock *, 8> ExitingBlocks;
  // Query the loop that was passed in rather than the member loop L, so the
  // answer matches the parameter (as getMaxNumOfPromotionsInLoop does).
  LP->getExitingBlocks(ExitingBlocks);
  // Not considered speculative.
  if (ExitingBlocks.size() == 1)
    return true;
  return ExitingBlocks.size() <= SpeculativeCounterPromotionMaxExiting;
}
// Computes how many counter promotions may be performed in loop LP.
// A result of 0 means that no promotion is to be done for LP.
unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
  SmallVector<BasicBlock *, 8> LoopExitBlocks;
  LP->getExitBlocks(LoopExitBlocks);

  // Counter updates cannot be inserted into a catchswitch block.
  for (BasicBlock *Exit : LoopExitBlocks)
    if (isa<CatchSwitchInst>(Exit->getTerminator()))
      return 0;

  // The loop must have dedicated exits and a preheader.
  if (!LP->hasDedicatedExits() || !LP->getLoopPreheader())
    return 0;

  SmallVector<BasicBlock *, 8> ExitingBlocks;
  LP->getExitingBlocks(ExitingBlocks);
  const auto NumExiting = ExitingBlocks.size();

  // A single exiting block is not considered speculative.
  if (NumExiting == 1)
    return MaxNumOfPromotionsPerLoop;

  if (NumExiting > SpeculativeCounterPromotionMaxExiting)
    return 0;

  // When promotion into a loop is permitted, the target blocks need no
  // further inspection.
  if (SpeculativeCounterPromotionToLoop)
    return MaxNumOfPromotionsPerLoop;

  // Otherwise limit the budget by what every enclosing target loop can
  // still take on top of the candidates it already has pending.
  unsigned Budget = MaxNumOfPromotionsPerLoop;
  for (BasicBlock *Target : LoopExitBlocks) {
    Loop *Enclosing = LI.getLoopFor(Target);
    if (!Enclosing)
      continue;
    unsigned TargetMax = getMaxNumOfPromotionsInLoop(Enclosing);
    unsigned Pending = LoopToCandidates[Enclosing].size();
    // Saturating subtraction: never wrap below zero.
    unsigned Room = TargetMax > Pending ? TargetMax - Pending : 0;
    Budget = std::min(Budget, Room);
  }
  return Budget;
}
DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
SmallVector<BasicBlock *, 8> ExitBlocks;
SmallVector<Instruction *, 8> InsertPts;
Loop &ParentLoop;
Loop &L;
LoopInfo &LI;
};
} // end anonymous namespace
@ -349,8 +420,10 @@ void InstrProfiling::promoteCounterLoadStores(Function *F) {
SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder();
for (auto *Loop : Loops) {
PGOCounterPromoter Promoter(LoopPromotionCandidates[Loop], *Loop);
// Do a post-order traversal of the loops so that counter updates can be
// iteratively hoisted outside the loop nest.
for (auto *Loop : llvm::reverse(Loops)) {
PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI);
Promoter.run(&TotalCountersPromoted);
}
}

View File

@ -1,5 +1,5 @@
; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
$__llvm_profile_raw_version = comdat any

View File

@ -1,5 +1,5 @@
; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
@g = common local_unnamed_addr global i32 0, align 4

View File

@ -0,0 +1,165 @@
; TEST that counter updates are promoted outside the whole loop nest
; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s
; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s
@g = common local_unnamed_addr global i32 0, align 4
@c = local_unnamed_addr global i32 10, align 4
; Function Attrs: noinline norecurse nounwind uwtable
; @bar loads the global @g, adds one, and stores the result back.
; It is called from the loop bodies in @main.
define void @bar() local_unnamed_addr #0 {
bb:
%tmp2 = load i32, i32* @g, align 4, !tbaa !2
%tmp3 = add nsw i32 %tmp2, 1
store i32 %tmp3, i32* @g, align 4, !tbaa !2
ret void
}
; Function Attrs: norecurse nounwind uwtable
; @main contains three loop structures executed one after another:
;   1) a three-level nest: %bb2 (outer) > %bb14 (middle) > %bb20 (inner),
;      which exits into %bb8;
;   2) a two-level nest: %bb45 (outer) > %bb57 (inner), which exits into %bb51;
;   3) a single loop %bb73, which exits into %bb81.
; The innermost loop bodies call @bar. The PROMO checks verify that %bb8,
; the exit block of the first nest, contains five consecutive
; load/add/store updates of @__profc_main counters.
define i32 @main() local_unnamed_addr #1 {
bb:
store i32 0, i32* @g, align 4, !tbaa !2
%tmp = load i32, i32* @c, align 4, !tbaa !2
%tmp1 = icmp sgt i32 %tmp, 0
br i1 %tmp1, label %bb2_1, label %bb84
; Block that only branches to the loop header %bb2 from outside the loop.
bb2_1:
br label %bb2
bb2: ; preds = %bb39, %bb2_1
%tmp3 = phi i32 [ %tmp40, %bb39 ], [ %tmp, %bb2_1 ]
%tmp5 = phi i32 [ %tmp43, %bb39 ], [ 0, %bb2_1 ]
%tmp7 = icmp sgt i32 %tmp3, 0
br i1 %tmp7, label %bb14_1, label %bb39
bb8: ; preds = %bb39
; Exit block of the loop headed by %bb2; the promoted counter updates are
; expected here.
; PROMO-LABEL: bb8
; PROMO: load {{.*}} @__profc_main{{.*}}
; PROMO-NEXT: add
; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
; PROMO-NEXT: add
; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
; PROMO-NEXT: add
; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
; PROMO-NEXT: add
; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
; PROMO-NEXT: add
; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
%tmp13 = icmp sgt i32 %tmp40, 0
br i1 %tmp13, label %bb45, label %bb84
; Block that only branches to the loop header %bb14 from outside that loop.
bb14_1:
br label %bb14
bb14: ; preds = %bb29, %bb14_1
%tmp15 = phi i32 [ %tmp30, %bb29 ], [ %tmp3, %bb14_1 ]
%tmp16 = phi i64 [ %tmp31, %bb29 ], [ 0, %bb14_1 ]
%tmp17 = phi i64 [ %tmp32, %bb29 ], [ 0, %bb14_1 ]
%tmp18 = phi i32 [ %tmp33, %bb29 ], [ 0, %bb14_1 ]
%tmp19 = icmp sgt i32 %tmp15, 0
br i1 %tmp19, label %bb20_split, label %bb29
; Block that only branches to the loop header %bb20 from outside that loop.
bb20_split:
br label %bb20
bb20: ; preds = %bb20, %bb20_split
%tmp21 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb20_split ]
%tmp22 = phi i32 [ %tmp24, %bb20 ], [ 0, %bb20_split ]
%tmp23 = add nuw i64 %tmp21, 1
tail call void @bar()
%tmp24 = add nuw nsw i32 %tmp22, 1
%tmp25 = load i32, i32* @c, align 4, !tbaa !2
%tmp26 = icmp slt i32 %tmp24, %tmp25
br i1 %tmp26, label %bb20, label %bb27
bb27: ; preds = %bb20
%tmp28 = add i64 %tmp23, %tmp16
br label %bb29
bb29: ; preds = %bb27, %bb14
%tmp30 = phi i32 [ %tmp25, %bb27 ], [ %tmp15, %bb14 ]
%tmp31 = phi i64 [ %tmp28, %bb27 ], [ %tmp16, %bb14 ]
%tmp32 = add nuw i64 %tmp17, 1
%tmp33 = add nuw nsw i32 %tmp18, 1
%tmp34 = icmp slt i32 %tmp33, %tmp30
br i1 %tmp34, label %bb14, label %bb35
bb35: ; preds = %bb29
%tmp36 = insertelement <2 x i64> undef, i64 %tmp31, i32 0
br label %bb39
bb39: ; preds = %bb35, %bb2
%tmp40 = phi i32 [ %tmp30, %bb35 ], [ %tmp3, %bb2 ]
%tmp43 = add nuw nsw i32 %tmp5, 1
%tmp44 = icmp slt i32 %tmp43, %tmp40
br i1 %tmp44, label %bb2, label %bb8
bb45: ; preds = %bb67, %bb8
%tmp46 = phi i32 [ %tmp68, %bb67 ], [ %tmp40, %bb8 ]
%tmp47 = phi i64 [ %tmp69, %bb67 ], [ 0, %bb8 ]
%tmp48 = phi i64 [ %tmp70, %bb67 ], [ 0, %bb8 ]
%tmp49 = phi i32 [ %tmp71, %bb67 ], [ 0, %bb8 ]
%tmp50 = icmp sgt i32 %tmp46, 0
br i1 %tmp50, label %bb57, label %bb67
bb51: ; preds = %bb67
%tmp56 = icmp sgt i32 %tmp68, 0
br i1 %tmp56, label %bb73, label %bb84
bb57: ; preds = %bb57, %bb45
%tmp58 = phi i64 [ %tmp60, %bb57 ], [ 0, %bb45 ]
%tmp59 = phi i32 [ %tmp61, %bb57 ], [ 0, %bb45 ]
%tmp60 = add nuw i64 %tmp58, 1
tail call void @bar()
%tmp61 = add nuw nsw i32 %tmp59, 1
%tmp62 = load i32, i32* @c, align 4, !tbaa !2
%tmp63 = mul nsw i32 %tmp62, 10
%tmp64 = icmp slt i32 %tmp61, %tmp63
br i1 %tmp64, label %bb57, label %bb65
bb65: ; preds = %bb57
%tmp66 = add i64 %tmp60, %tmp47
br label %bb67
bb67: ; preds = %bb65, %bb45
%tmp68 = phi i32 [ %tmp62, %bb65 ], [ %tmp46, %bb45 ]
%tmp69 = phi i64 [ %tmp66, %bb65 ], [ %tmp47, %bb45 ]
%tmp70 = add nuw i64 %tmp48, 1
%tmp71 = add nuw nsw i32 %tmp49, 1
%tmp72 = icmp slt i32 %tmp71, %tmp68
br i1 %tmp72, label %bb45, label %bb51
bb73: ; preds = %bb73, %bb51
%tmp74 = phi i64 [ %tmp76, %bb73 ], [ 0, %bb51 ]
%tmp75 = phi i32 [ %tmp77, %bb73 ], [ 0, %bb51 ]
%tmp76 = add nuw i64 %tmp74, 1
tail call void @bar()
%tmp77 = add nuw nsw i32 %tmp75, 1
%tmp78 = load i32, i32* @c, align 4, !tbaa !2
%tmp79 = mul nsw i32 %tmp78, 100
%tmp80 = icmp slt i32 %tmp77, %tmp79
br i1 %tmp80, label %bb73, label %bb81
bb81: ; preds = %bb73
br label %bb84
bb84: ; preds = %bb81, %bb51, %bb8, %bb
ret i32 0
}
attributes #0 = { noinline }
attributes #1 = { norecurse nounwind uwtable }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 5.0.0 (trunk 307355)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}