[SimplifyCFG] 'Fold branch to common dest': don't overestimate the cost

`FoldBranchToCommonDest()` has a certain budget (`-bonus-inst-threshold=`)
for bonus instruction duplication. Currently it calculates the cost
as if it would actually duplicate the bonus instructions into every predecessor.

But even ignoring the budget, it won't always duplicate into every predecessor:
there are some correctness and profitability checks that come first.
So when calculating the cost, we should first determine which blocks
we will *actually* duplicate into, and only then use that block count
to do the budgeting.
This commit is contained in:
Roman Lebedev 2021-03-23 18:22:30 +03:00
parent a866f72eb2
commit b5822026dd
No known key found for this signature in database
GPG Key ID: 083C3EBB4A1689E0
2 changed files with 61 additions and 74 deletions

View File

@ -3033,8 +3033,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
BasicBlock *BB = BI->getParent();
const unsigned PredCount = pred_size(BB);
bool Changed = false;
TargetTransformInfo::TargetCostKind CostKind =
@ -3047,32 +3045,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
Cond->getParent() != BB || !Cond->hasOneUse())
return Changed;
// Only allow this transformation if computing the condition doesn't involve
// too many instructions and these involved instructions can be executed
// unconditionally. We denote all involved instructions except the condition
// as "bonus instructions", and only allow this transformation when the
// number of the bonus instructions we'll need to create when cloning into
// each predecessor does not exceed a certain threshold.
unsigned NumBonusInsts = 0;
for (Instruction &I : *BB) {
// Don't check the branch condition comparison itself.
if (&I == Cond)
continue;
// Ignore dbg intrinsics, and the terminator.
if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
continue;
// I must be safe to execute unconditionally.
if (!isSafeToSpeculativelyExecute(&I))
return Changed;
// Account for the cost of duplicating this instruction into each
// predecessor.
NumBonusInsts += PredCount;
// Early exits once we reach the limit.
if (NumBonusInsts > BonusInstThreshold)
return Changed;
}
// Cond is known to be a compare or binary operator. Check to make sure that
// neither operand is a potentially-trapping constant expression.
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
@ -3086,6 +3058,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
if (is_contained(successors(BB), BB))
return Changed;
// Which predecessors do we want to deal with?
SmallVector<BasicBlock *, 8> Preds;
for (BasicBlock *PredBlock : predecessors(BB)) {
BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
@ -3116,6 +3090,40 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
continue;
}
// Ok, we do want to deal with this predecessor. Record it.
Preds.emplace_back(PredBlock);
}
const unsigned PredCount = Preds.size();
// Only allow this transformation if computing the condition doesn't involve
// too many instructions and these involved instructions can be executed
// unconditionally. We denote all involved instructions except the condition
// as "bonus instructions", and only allow this transformation when the
// number of the bonus instructions we'll need to create when cloning into
// each predecessor does not exceed a certain threshold.
unsigned NumBonusInsts = 0;
for (Instruction &I : *BB) {
// Don't check the branch condition comparison itself.
if (&I == Cond)
continue;
// Ignore dbg intrinsics, and the terminator.
if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
continue;
// I must be safe to execute unconditionally.
if (!isSafeToSpeculativelyExecute(&I))
return Changed;
// Account for the cost of duplicating this instruction into each
// predecessor.
NumBonusInsts += PredCount;
// Exit early once we exceed the limit.
if (NumBonusInsts > BonusInstThreshold)
return Changed;
}
// Ok, we have the budget. Perform the transformation.
for (BasicBlock *PredBlock : Preds) {
auto *PBI = cast<BranchInst>(PredBlock->getTerminator());
return performBranchToCommonDestFolding(BI, PBI, DTU, MSSAU, PoisonSafe,
TTI);
}

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=THR1 %s
; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=THR2 %s
; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=ALL,THR1 %s
; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=ALL,THR2 %s
declare void @sideeffect0()
declare void @sideeffect1()
@ -82,50 +82,29 @@ final_right:
; But, we aren't going to clone it into one of the predecessors,
; because that isn't profitable. So we should not use it in cost calculation.
define void @two_preds_with_extra_op_and_branchweights(i8 %v0, i8 %v1, i8 %v2, i8 %v3) {
; THR1-LABEL: @two_preds_with_extra_op_and_branchweights(
; THR1-NEXT: entry:
; THR1-NEXT: [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
; THR1-NEXT: br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
; THR1: pred0:
; THR1-NEXT: [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
; THR1-NEXT: br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
; THR1: pred1:
; THR1-NEXT: [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
; THR1-NEXT: br i1 [[C2]], label [[DISPATCH]], label [[FINAL_RIGHT:%.*]]
; THR1: dispatch:
; THR1-NEXT: [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
; THR1-NEXT: [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
; THR1-NEXT: br i1 [[C3]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
; THR1: final_left:
; THR1-NEXT: call void @sideeffect0()
; THR1-NEXT: ret void
; THR1: final_right:
; THR1-NEXT: call void @sideeffect1()
; THR1-NEXT: ret void
;
; THR2-LABEL: @two_preds_with_extra_op_and_branchweights(
; THR2-NEXT: entry:
; THR2-NEXT: [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
; THR2-NEXT: br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
; THR2: pred0:
; THR2-NEXT: [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
; THR2-NEXT: br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
; THR2: pred1:
; THR2-NEXT: [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
; THR2-NEXT: [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
; THR2-NEXT: [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
; THR2-NEXT: [[OR_COND:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false
; THR2-NEXT: br i1 [[OR_COND]], label [[FINAL_LEFT]], label [[FINAL_RIGHT:%.*]]
; THR2: dispatch:
; THR2-NEXT: [[V3_ADJ_OLD:%.*]] = add i8 [[V1]], [[V2]]
; THR2-NEXT: [[C3_OLD:%.*]] = icmp eq i8 [[V3_ADJ_OLD]], 0
; THR2-NEXT: br i1 [[C3_OLD]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
; THR2: final_left:
; THR2-NEXT: call void @sideeffect0()
; THR2-NEXT: ret void
; THR2: final_right:
; THR2-NEXT: call void @sideeffect1()
; THR2-NEXT: ret void
; ALL-LABEL: @two_preds_with_extra_op_and_branchweights(
; ALL-NEXT: entry:
; ALL-NEXT: [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
; ALL-NEXT: br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
; ALL: pred0:
; ALL-NEXT: [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
; ALL-NEXT: br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
; ALL: pred1:
; ALL-NEXT: [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
; ALL-NEXT: [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
; ALL-NEXT: [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
; ALL-NEXT: [[OR_COND:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false
; ALL-NEXT: br i1 [[OR_COND]], label [[FINAL_LEFT]], label [[FINAL_RIGHT:%.*]]
; ALL: dispatch:
; ALL-NEXT: [[V3_ADJ_OLD:%.*]] = add i8 [[V1]], [[V2]]
; ALL-NEXT: [[C3_OLD:%.*]] = icmp eq i8 [[V3_ADJ_OLD]], 0
; ALL-NEXT: br i1 [[C3_OLD]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
; ALL: final_left:
; ALL-NEXT: call void @sideeffect0()
; ALL-NEXT: ret void
; ALL: final_right:
; ALL-NEXT: call void @sideeffect1()
; ALL-NEXT: ret void
;
entry:
%c0 = icmp eq i8 %v0, 0