From d49cb5b3035b02ffdd0cc8cf4c69c6e5369558f6 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Mon, 30 Aug 2021 12:14:57 -0700
Subject: [PATCH] [SimplifyCFG] Add bonus when seeing vector ops to branch fold
 to common dest

This makes some tests in vector-reductions-logical.ll more stable when
applying D108837.

The cost of branching is higher when vector ops are involved due to
potential SLP transformations.

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D108935
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 25 ++++-
 .../X86/vector-reductions-logical.ll          | 93 +++++++------------
 2 files changed, 58 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 2f9eaf634519..31ef306f8b88 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -160,6 +160,13 @@ static cl::opt<unsigned>
                         cl::desc("Maximum cost of combining conditions when "
                                  "folding branches"));
 
+static cl::opt<unsigned> BranchFoldToCommonDestVectorMultiplier(
+    "simplifycfg-branch-fold-common-dest-vector-multiplier", cl::Hidden,
+    cl::init(2),
+    cl::desc("Multiplier to apply to threshold when determining whether or not "
+             "to fold branch to common destination when vector operations are "
+             "present"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -3144,6 +3151,14 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
   return true;
 }
 
+/// Return true if the instruction's result type or the type of any of its
+/// operands is a vector type.
+static bool isVectorOp(Instruction &I) {
+  return I.getType()->isVectorTy() || any_of(I.operands(), [](Use &U) {
+           return U->getType()->isVectorTy();
+         });
+}
+
 /// If this basic block is simple enough, and if a predecessor branches to us
 /// and one of our successors, fold the block into the predecessor and use
 /// logical operations to pick the right destination.
@@ -3228,6 +3243,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
   // number of the bonus instructions we'll need to create when cloning into
   // each predecessor does not exceed a certain threshold.
   unsigned NumBonusInsts = 0;
+  bool SawVectorOp = false;
   const unsigned PredCount = Preds.size();
   for (Instruction &I : *BB) {
     // Don't check the branch condition comparison itself.
@@ -3239,12 +3255,15 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
     // I must be safe to execute unconditionally.
     if (!isSafeToSpeculativelyExecute(&I))
       return false;
+    SawVectorOp |= isVectorOp(I);
 
     // Account for the cost of duplicating this instruction into each
     // predecessor.
     NumBonusInsts += PredCount;
+
     // Early exits once we reach the limit.
-    if (NumBonusInsts > BonusInstThreshold)
+    if (NumBonusInsts >
+        BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier)
       return false;
 
     auto IsBCSSAUse = [BB, &I](Use &U) {
@@ -3258,6 +3277,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
     if (!all_of(I.uses(), IsBCSSAUse))
       return false;
   }
+  if (NumBonusInsts >
+      BonusInstThreshold *
+          (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1))
+    return false;
 
   // Ok, we have the budget. Perform the transformation.
   for (BasicBlock *PredBlock : Preds) {
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
index 46cf58e48642..2fafc768bd96 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
@@ -90,33 +90,24 @@ return:
 define float @test_merge_anyof_v4sf(<4 x float> %t) {
 ; CHECK-LABEL: @test_merge_anyof_v4sf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[T:%.*]], i32 0
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[VECEXT]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]]
-; CHECK:       lor.lhs.false:
-; CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x float> [[T]], i32 1
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt float [[VECEXT2]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP4]], label [[RETURN]], label [[LOR_LHS_FALSE6:%.*]]
-; CHECK:       lor.lhs.false6:
-; CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x float> [[T]], i32 2
-; CHECK-NEXT:    [[CMP9:%.*]] = fcmp olt float [[VECEXT7]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP9]], label [[RETURN]], label [[LOR_LHS_FALSE11:%.*]]
-; CHECK:       lor.lhs.false11:
-; CHECK-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x float> [[T]], i32 3
-; CHECK-NEXT:    [[CMP14:%.*]] = fcmp olt float [[VECEXT12]], 0.000000e+00
-; CHECK-NEXT:    [[CMP19:%.*]] = fcmp ogt float [[VECEXT]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP14]], i1 true, i1 [[CMP19]]
-; CHECK-NEXT:    [[CMP24:%.*]] = fcmp ogt float [[VECEXT2]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND1:%.*]] = select i1 [[OR_COND]], i1 true, i1 [[CMP24]]
-; CHECK-NEXT:    [[CMP29:%.*]] = fcmp ogt float [[VECEXT7]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND2:%.*]] = select i1 [[OR_COND1]], i1 true, i1 [[CMP29]]
-; CHECK-NEXT:    [[CMP34:%.*]] = fcmp ogt float [[VECEXT12]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND3:%.*]] = select i1 [[OR_COND2]], i1 true, i1 [[CMP34]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT2]]
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[OR_COND3]], float 0.000000e+00, float [[ADD]]
-; CHECK-NEXT:    br label [[RETURN]]
-; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi float [ 0.000000e+00, [[LOR_LHS_FALSE6]] ], [ 0.000000e+00, [[LOR_LHS_FALSE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE11]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[T:%.*]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[T]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[T]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[T]], i32 0
+; CHECK-NEXT:    [[T_FR:%.*]] = freeze <4 x float> [[T]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0
+; CHECK-NEXT:    [[CMP19:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00
+; CHECK-NEXT:    [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP19]]
+; CHECK-NEXT:    [[CMP24:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00
+; CHECK-NEXT:    [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP24]]
+; CHECK-NEXT:    [[CMP29:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00
+; CHECK-NEXT:    [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP29]]
+; CHECK-NEXT:    [[CMP34:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00
+; CHECK-NEXT:    [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP34]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]]
 ; CHECK-NEXT:    ret float [[RETVAL_0]]
 ;
 entry:
@@ -270,33 +261,22 @@ return:
 define float @test_separate_anyof_v4sf(<4 x float> %t) {
 ; CHECK-LABEL: @test_separate_anyof_v4sf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[T:%.*]], i32 0
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[VECEXT]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]]
-; CHECK:       lor.lhs.false:
-; CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x float> [[T]], i32 1
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt float [[VECEXT2]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP4]], label [[RETURN]], label [[LOR_LHS_FALSE6:%.*]]
-; CHECK:       lor.lhs.false6:
-; CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x float> [[T]], i32 2
-; CHECK-NEXT:    [[CMP9:%.*]] = fcmp olt float [[VECEXT7]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP9]], label [[RETURN]], label [[LOR_LHS_FALSE11:%.*]]
-; CHECK:       lor.lhs.false11:
-; CHECK-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x float> [[T]], i32 3
-; CHECK-NEXT:    [[CMP14:%.*]] = fcmp olt float [[VECEXT12]], 0.000000e+00
-; CHECK-NEXT:    [[CMP18:%.*]] = fcmp ogt float [[VECEXT]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP14]], i1 true, i1 [[CMP18]]
-; CHECK-NEXT:    [[CMP23:%.*]] = fcmp ogt float [[VECEXT2]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND1:%.*]] = select i1 [[OR_COND]], i1 true, i1 [[CMP23]]
-; CHECK-NEXT:    [[CMP28:%.*]] = fcmp ogt float [[VECEXT7]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND2:%.*]] = select i1 [[OR_COND1]], i1 true, i1 [[CMP28]]
-; CHECK-NEXT:    [[CMP33:%.*]] = fcmp ogt float [[VECEXT12]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND3:%.*]] = select i1 [[OR_COND2]], i1 true, i1 [[CMP33]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT2]]
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[OR_COND3]], float 0.000000e+00, float [[ADD]]
+; CHECK-NEXT:    [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i4 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[IF_END:%.*]], label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT:    [[DOTNOT7:%.*]] = icmp eq i4 [[TMP3]], 0
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR]]
+; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[DOTNOT7]], float [[ADD]], float 0.000000e+00
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi float [ 0.000000e+00, [[LOR_LHS_FALSE6]] ], [ 0.000000e+00, [[LOR_LHS_FALSE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE11]] ]
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[IF_END]] ]
 ; CHECK-NEXT:    ret float [[RETVAL_0]]
 ;
 entry:
@@ -371,20 +351,15 @@ define float @test_merge_allof_v4si(<4 x i32> %t) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt <4 x i32> [[T_FR]], zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i4 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[TMP2]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]]
-; CHECK:       lor.lhs.false:
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i32> [[T_FR]], <i32 256, i32 256, i32 256, i32 256>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i1> [[TMP3]] to i4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i4 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[RETURN]], label [[IF_END:%.*]]
-; CHECK:       if.end:
+; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[TMP2]], [[TMP5]]
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[T_FR]], [[SHIFT]]
 ; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[ADD]] to float
-; CHECK-NEXT:    br label [[RETURN]]
-; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi float [ [[CONV]], [[IF_END]] ], [ 0.000000e+00, [[LOR_LHS_FALSE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND]], float 0.000000e+00, float [[CONV]]
 ; CHECK-NEXT:    ret float [[RETVAL_0]]
 ;
 entry: