[SimplifyCFG] 'Fold branch to common dest': don't overestimate the cost

`FoldBranchToCommonDest()` has a certain budget (`-bonus-inst-threshold=`) for bonus instruction duplication. And currently it calculates the cost as-if it will actually duplicate into each predecessor. But ignoring the budget, it won't always duplicate into each predecessor, there are some correctness and profitability checks. So when calculating the cost, we should first check into which blocks will we *actually* duplicate, and only then use that block count to do budgeting.
2021-03-23 18:22:30 +03:00 · 2021-03-23 18:22:30 +03:00 · b5822026dd
parent a866f72eb2
commit b5822026dd
2 changed files with 61 additions and 74 deletions
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@ -3033,8 +3033,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,

  BasicBlock *BB = BI->getParent();

-  const unsigned PredCount = pred_size(BB);
-
  bool Changed = false;

  TargetTransformInfo::TargetCostKind CostKind =
@ -3047,32 +3045,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
      Cond->getParent() != BB || !Cond->hasOneUse())
    return Changed;

-  // Only allow this transformation if computing the condition doesn't involve
-  // too many instructions and these involved instructions can be executed
-  // unconditionally. We denote all involved instructions except the condition
-  // as "bonus instructions", and only allow this transformation when the
-  // number of the bonus instructions we'll need to create when cloning into
-  // each predecessor does not exceed a certain threshold.
-  unsigned NumBonusInsts = 0;
-  for (Instruction &I : *BB) {
-    // Don't check the branch condition comparison itself.
-    if (&I == Cond)
-      continue;
-    // Ignore dbg intrinsics, and the terminator.
-    if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
-      continue;
-    // I must be safe to execute unconditionally.
-    if (!isSafeToSpeculativelyExecute(&I))
-      return Changed;
-
-    // Account for the cost of duplicating this instruction into each
-    // predecessor.
-    NumBonusInsts += PredCount;
-    // Early exits once we reach the limit.
-    if (NumBonusInsts > BonusInstThreshold)
-      return Changed;
-  }
-
  // Cond is known to be a compare or binary operator.  Check to make sure that
  // neither operand is a potentially-trapping constant expression.
  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
@ -3086,6 +3058,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
  if (is_contained(successors(BB), BB))
    return Changed;

+  // With which predecessors will we want to deal with?
+  SmallVector<BasicBlock *, 8> Preds;
  for (BasicBlock *PredBlock : predecessors(BB)) {
    BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());

@ -3116,6 +3090,40 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
        continue;
    }

+    // Ok, we do want to deal with this predecessor. Record it.
+    Preds.emplace_back(PredBlock);
+  }
+
+  const unsigned PredCount = Preds.size();
+  // Only allow this transformation if computing the condition doesn't involve
+  // too many instructions and these involved instructions can be executed
+  // unconditionally. We denote all involved instructions except the condition
+  // as "bonus instructions", and only allow this transformation when the
+  // number of the bonus instructions we'll need to create when cloning into
+  // each predecessor does not exceed a certain threshold.
+  unsigned NumBonusInsts = 0;
+  for (Instruction &I : *BB) {
+    // Don't check the branch condition comparison itself.
+    if (&I == Cond)
+      continue;
+    // Ignore dbg intrinsics, and the terminator.
+    if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
+      continue;
+    // I must be safe to execute unconditionally.
+    if (!isSafeToSpeculativelyExecute(&I))
+      return Changed;
+
+    // Account for the cost of duplicating this instruction into each
+    // predecessor.
+    NumBonusInsts += PredCount;
+    // Early exits once we reach the limit.
+    if (NumBonusInsts > BonusInstThreshold)
+      return Changed;
+  }
+
+  // Ok, we have the budget. Perform the transformation.
+  for (BasicBlock *PredBlock : Preds) {
+    auto *PBI = cast<BranchInst>(PredBlock->getTerminator());
    return performBranchToCommonDestFolding(BI, PBI, DTU, MSSAU, PoisonSafe,
                                            TTI);
  }
--- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll
+++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=THR1 %s
-; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=THR2 %s
+; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=ALL,THR1 %s
+; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=ALL,THR2 %s

 declare void @sideeffect0()
 declare void @sideeffect1()
@ -82,50 +82,29 @@ final_right:
 ; But, we aren't going to clone it into one of the predecessors,
 ; because that isn't profitable. So we should not use it in cost calculation.
 define void @two_preds_with_extra_op_and_branchweights(i8 %v0, i8 %v1, i8 %v2, i8 %v3) {
-; THR1-LABEL: @two_preds_with_extra_op_and_branchweights(
-; THR1-NEXT:  entry:
-; THR1-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; THR1-NEXT:    br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
-; THR1:       pred0:
-; THR1-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
-; THR1-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
-; THR1:       pred1:
-; THR1-NEXT:    [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
-; THR1-NEXT:    br i1 [[C2]], label [[DISPATCH]], label [[FINAL_RIGHT:%.*]]
-; THR1:       dispatch:
-; THR1-NEXT:    [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
-; THR1-NEXT:    [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
-; THR1-NEXT:    br i1 [[C3]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
-; THR1:       final_left:
-; THR1-NEXT:    call void @sideeffect0()
-; THR1-NEXT:    ret void
-; THR1:       final_right:
-; THR1-NEXT:    call void @sideeffect1()
-; THR1-NEXT:    ret void
-;
-; THR2-LABEL: @two_preds_with_extra_op_and_branchweights(
-; THR2-NEXT:  entry:
-; THR2-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; THR2-NEXT:    br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
-; THR2:       pred0:
-; THR2-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
-; THR2-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
-; THR2:       pred1:
-; THR2-NEXT:    [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
-; THR2-NEXT:    [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
-; THR2-NEXT:    [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
-; THR2-NEXT:    [[OR_COND:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false
-; THR2-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT]], label [[FINAL_RIGHT:%.*]]
-; THR2:       dispatch:
-; THR2-NEXT:    [[V3_ADJ_OLD:%.*]] = add i8 [[V1]], [[V2]]
-; THR2-NEXT:    [[C3_OLD:%.*]] = icmp eq i8 [[V3_ADJ_OLD]], 0
-; THR2-NEXT:    br i1 [[C3_OLD]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
-; THR2:       final_left:
-; THR2-NEXT:    call void @sideeffect0()
-; THR2-NEXT:    ret void
-; THR2:       final_right:
-; THR2-NEXT:    call void @sideeffect1()
-; THR2-NEXT:    ret void
+; ALL-LABEL: @two_preds_with_extra_op_and_branchweights(
+; ALL-NEXT:  entry:
+; ALL-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
+; ALL-NEXT:    br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
+; ALL:       pred0:
+; ALL-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
+; ALL-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
+; ALL:       pred1:
+; ALL-NEXT:    [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
+; ALL-NEXT:    [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
+; ALL-NEXT:    [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
+; ALL-NEXT:    [[OR_COND:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false
+; ALL-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT]], label [[FINAL_RIGHT:%.*]]
+; ALL:       dispatch:
+; ALL-NEXT:    [[V3_ADJ_OLD:%.*]] = add i8 [[V1]], [[V2]]
+; ALL-NEXT:    [[C3_OLD:%.*]] = icmp eq i8 [[V3_ADJ_OLD]], 0
+; ALL-NEXT:    br i1 [[C3_OLD]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
+; ALL:       final_left:
+; ALL-NEXT:    call void @sideeffect0()
+; ALL-NEXT:    ret void
+; ALL:       final_right:
+; ALL-NEXT:    call void @sideeffect1()
+; ALL-NEXT:    ret void
 ;
 entry:
  %c0 = icmp eq i8 %v0, 0