From d49cb5b3035b02ffdd0cc8cf4c69c6e5369558f6 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 30 Aug 2021 12:14:57 -0700 Subject: [PATCH] [SimplifyCFG] Add bonus when seeing vector ops to branch fold to common dest This makes some tests in vector-reductions-logical.ll more stable when applying D108837. The cost of branching is higher when vector ops are involved due to potential SLP transformations. Reviewed By: spatel Differential Revision: https://reviews.llvm.org/D108935 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 25 ++++- .../X86/vector-reductions-logical.ll | 93 +++++++------------ 2 files changed, 58 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 2f9eaf634519..31ef306f8b88 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -160,6 +160,13 @@ static cl::opt<unsigned> cl::desc("Maximum cost of combining conditions when " "folding branches")); +static cl::opt<unsigned> BranchFoldToCommonDestVectorMultiplier( + "simplifycfg-branch-fold-common-dest-vector-multiplier", cl::Hidden, + cl::init(2), + cl::desc("Multiplier to apply to threshold when determining whether or not " + "to fold branch to common destination when vector operations are " + "present")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -3144,6 +3151,14 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, return true; } +/// Return true if an instruction's type or any of its operands' types are a vector +/// type. 
+static bool isVectorOp(Instruction &I) { + return I.getType()->isVectorTy() || any_of(I.operands(), [](Use &U) { + return U->getType()->isVectorTy(); + }); +} + /// If this basic block is simple enough, and if a predecessor branches to us /// and one of our successors, fold the block into the predecessor and use /// logical operations to pick the right destination. @@ -3228,6 +3243,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, // number of the bonus instructions we'll need to create when cloning into // each predecessor does not exceed a certain threshold. unsigned NumBonusInsts = 0; + bool SawVectorOp = false; const unsigned PredCount = Preds.size(); for (Instruction &I : *BB) { // Don't check the branch condition comparison itself. @@ -3239,12 +3255,15 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, // I must be safe to execute unconditionally. if (!isSafeToSpeculativelyExecute(&I)) return false; + SawVectorOp |= isVectorOp(I); // Account for the cost of duplicating this instruction into each // predecessor. NumBonusInsts += PredCount; + // Early exits once we reach the limit. - if (NumBonusInsts > BonusInstThreshold) + if (NumBonusInsts > + BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) return false; auto IsBCSSAUse = [BB, &I](Use &U) { @@ -3258,6 +3277,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, if (!all_of(I.uses(), IsBCSSAUse)) return false; } + if (NumBonusInsts > + BonusInstThreshold * + (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1)) + return false; // Ok, we have the budget. Perform the transformation. 
for (BasicBlock *PredBlock : Preds) { diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll index 46cf58e48642..2fafc768bd96 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -90,33 +90,24 @@ return: define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_merge_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[T:%.*]], i32 0 -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[VECEXT]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] -; CHECK: lor.lhs.false: -; CHECK-NEXT: [[VECEXT2:%.*]] = extractelement <4 x float> [[T]], i32 1 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt float [[VECEXT2]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP4]], label [[RETURN]], label [[LOR_LHS_FALSE6:%.*]] -; CHECK: lor.lhs.false6: -; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x float> [[T]], i32 2 -; CHECK-NEXT: [[CMP9:%.*]] = fcmp olt float [[VECEXT7]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP9]], label [[RETURN]], label [[LOR_LHS_FALSE11:%.*]] -; CHECK: lor.lhs.false11: -; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <4 x float> [[T]], i32 3 -; CHECK-NEXT: [[CMP14:%.*]] = fcmp olt float [[VECEXT12]], 0.000000e+00 -; CHECK-NEXT: [[CMP19:%.*]] = fcmp ogt float [[VECEXT]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP14]], i1 true, i1 [[CMP19]] -; CHECK-NEXT: [[CMP24:%.*]] = fcmp ogt float [[VECEXT2]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[OR_COND]], i1 true, i1 [[CMP24]] -; CHECK-NEXT: [[CMP29:%.*]] = fcmp ogt float [[VECEXT7]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND2:%.*]] = select i1 [[OR_COND1]], i1 true, i1 [[CMP29]] -; CHECK-NEXT: [[CMP34:%.*]] = fcmp ogt float [[VECEXT12]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[OR_COND2]], i1 
true, i1 [[CMP34]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT2]] -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[OR_COND3]], float 0.000000e+00, float [[ADD]] -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float [ 0.000000e+00, [[LOR_LHS_FALSE6]] ], [ 0.000000e+00, [[LOR_LHS_FALSE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE11]] ] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[T:%.*]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[T]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[T]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[T]], i32 0 +; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 +; CHECK-NEXT: [[CMP19:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP19]] +; CHECK-NEXT: [[CMP24:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP24]] +; CHECK-NEXT: [[CMP29:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP29]] +; CHECK-NEXT: [[CMP34:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP34]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]] ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: @@ -270,33 +261,22 @@ return: define float @test_separate_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_separate_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[T:%.*]], i32 0 -; CHECK-NEXT: [[CMP:%.*]] = 
fcmp olt float [[VECEXT]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] -; CHECK: lor.lhs.false: -; CHECK-NEXT: [[VECEXT2:%.*]] = extractelement <4 x float> [[T]], i32 1 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt float [[VECEXT2]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP4]], label [[RETURN]], label [[LOR_LHS_FALSE6:%.*]] -; CHECK: lor.lhs.false6: -; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x float> [[T]], i32 2 -; CHECK-NEXT: [[CMP9:%.*]] = fcmp olt float [[VECEXT7]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP9]], label [[RETURN]], label [[LOR_LHS_FALSE11:%.*]] -; CHECK: lor.lhs.false11: -; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <4 x float> [[T]], i32 3 -; CHECK-NEXT: [[CMP14:%.*]] = fcmp olt float [[VECEXT12]], 0.000000e+00 -; CHECK-NEXT: [[CMP18:%.*]] = fcmp ogt float [[VECEXT]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP14]], i1 true, i1 [[CMP18]] -; CHECK-NEXT: [[CMP23:%.*]] = fcmp ogt float [[VECEXT2]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[OR_COND]], i1 true, i1 [[CMP23]] -; CHECK-NEXT: [[CMP28:%.*]] = fcmp ogt float [[VECEXT7]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND2:%.*]] = select i1 [[OR_COND1]], i1 true, i1 [[CMP28]] -; CHECK-NEXT: [[CMP33:%.*]] = fcmp ogt float [[VECEXT12]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[OR_COND2]], i1 true, i1 [[CMP33]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT2]] -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[OR_COND3]], float 0.000000e+00, float [[ADD]] +; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[DOTNOT]], label [[IF_END:%.*]], label [[RETURN:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast 
<4 x i1> [[TMP2]] to i4 +; CHECK-NEXT: [[DOTNOT7:%.*]] = icmp eq i4 [[TMP3]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[DOTNOT7]], float [[ADD]], float 0.000000e+00 ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float [ 0.000000e+00, [[LOR_LHS_FALSE6]] ], [ 0.000000e+00, [[LOR_LHS_FALSE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE11]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[IF_END]] ] ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: @@ -371,20 +351,15 @@ define float @test_merge_allof_v4si(<4 x i32> %t) { ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i32> [[T_FR]], zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i4 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] -; CHECK: lor.lhs.false: ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[T_FR]], ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i1> [[TMP3]] to i4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i4 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[RETURN]], label [[IF_END:%.*]] -; CHECK: if.end: +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[TMP2]], [[TMP5]] ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[T_FR]], [[SHIFT]] ; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float [ [[CONV]], [[IF_END]] ], [ 0.000000e+00, [[LOR_LHS_FALSE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: 
[[RETVAL_0:%.*]] = select i1 [[OR_COND]], float 0.000000e+00, float [[CONV]] ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: