diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index 5ab2dd496282..fb6f0269a0ac 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -199,6 +199,7 @@ bool FlattenCFG(BasicBlock *BB, AAResults *AA = nullptr); /// branches to us and one of our successors, fold the setcc into the /// predecessor and use logical operations to pick the right destination. bool FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU = nullptr, + const TargetTransformInfo *TTI = nullptr, unsigned BonusInstThreshold = 1); /// This function takes a virtual register computed by an Instruction and diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index ae2471969160..124a7c423e72 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -143,6 +143,13 @@ MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10), cl::desc("Max size of a block which is still considered " "small enough to thread through")); +// Two is chosen to allow one negation and a logical combine. +static cl::opt + BranchFoldThreshold("simplifycfg-branch-fold-threshold", cl::Hidden, + cl::init(2), + cl::desc("Maximum cost of combining conditions when " + "folding branches")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -2684,12 +2691,16 @@ static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI, /// and one of our successors, fold the block into the predecessor and use /// logical operations to pick the right destination. bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, + const TargetTransformInfo *TTI, unsigned BonusInstThreshold) { BasicBlock *BB = BI->getParent(); const unsigned PredCount = pred_size(BB); bool Changed = false; + TargetTransformInfo::TargetCostKind CostKind = + BB->getParent()->hasMinSize() ? TargetTransformInfo::TCK_CodeSize + : TargetTransformInfo::TCK_SizeAndLatency; Instruction *Cond = nullptr; if (BI->isConditional()) @@ -2818,6 +2829,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU, continue; } + // Check the cost of inserting the necessary logic before performing the + // transformation. + if (TTI && Opc != Instruction::BinaryOpsEnd) { + Type *Ty = BI->getCondition()->getType(); + unsigned Cost = TTI->getArithmeticInstrCost(Opc, Ty, CostKind); + if (InvertPredCond && (!PBI->getCondition()->hasOneUse() || + !isa(PBI->getCondition()))) + Cost += TTI->getArithmeticInstrCost(Instruction::Xor, Ty, CostKind); + + if (Cost > BranchFoldThreshold) + continue; + } + LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); Changed = true; @@ -6013,7 +6037,7 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, // branches to us and our successor, fold the comparison into the // predecessor and use logical operations to update the incoming value // for PHI nodes in common successor. - if (FoldBranchToCommonDest(BI, nullptr, Options.BonusInstThreshold)) + if (FoldBranchToCommonDest(BI, nullptr, &TTI, Options.BonusInstThreshold)) return requestResimplify(); return false; } @@ -6076,7 +6100,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. - if (FoldBranchToCommonDest(BI, nullptr, Options.BonusInstThreshold)) + if (FoldBranchToCommonDest(BI, nullptr, &TTI, Options.BonusInstThreshold)) return requestResimplify(); // We have a conditional branch to two blocks that are only reachable diff --git a/llvm/test/Transforms/SimplifyCFG/ARM/branch-fold-threshold.ll b/llvm/test/Transforms/SimplifyCFG/ARM/branch-fold-threshold.ll index 2bcbaff50a97..ffb13ca583f7 100644 --- a/llvm/test/Transforms/SimplifyCFG/ARM/branch-fold-threshold.ll +++ b/llvm/test/Transforms/SimplifyCFG/ARM/branch-fold-threshold.ll @@ -169,19 +169,34 @@ cond.end: } define i32 @or_predicate_minsize(i32 %a, i32 %b, i32 %c, i32 %d, i32* %input) #0 { -; CHECK-LABEL: @or_predicate_minsize( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D:%.*]], 3 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] -; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; CHECK: cond.false: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 -; CHECK-NEXT: br label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: ret i32 [[COND]] +; THUMB-LABEL: @or_predicate_minsize( +; THUMB-NEXT: entry: +; THUMB-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D:%.*]], 3 +; THUMB-NEXT: br i1 [[CMP]], label [[COND_END:%.*]], label [[LOR_LHS_FALSE:%.*]] +; THUMB: lor.lhs.false: +; THUMB-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; THUMB-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; THUMB-NEXT: br i1 [[CMP1]], label [[COND_END]], label [[COND_FALSE:%.*]] +; THUMB: cond.false: +; THUMB-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; THUMB-NEXT: br label [[COND_END]] +; THUMB: cond.end: +; THUMB-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[LOR_LHS_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; THUMB-NEXT: ret i32 [[COND]] +; +; ARM-LABEL: @or_predicate_minsize( +; ARM-NEXT: entry: +; ARM-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D:%.*]], 3 +; ARM-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; ARM-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; ARM-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] +; ARM-NEXT: br i1 [[OR_COND]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] +; ARM: cond.false: +; ARM-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; ARM-NEXT: br label [[COND_END]] +; ARM: cond.end: +; ARM-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; ARM-NEXT: ret i32 [[COND]] ; entry: %cmp = icmp sgt i32 %d, 3 @@ -202,19 +217,34 @@ cond.end: } define i32 @or_invert_predicate_minsize(i32 %a, i32 %b, i32 %c, i32 %d, i32* %input) #0 { -; CHECK-LABEL: @or_invert_predicate_minsize( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[D:%.*]], 3 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] -; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; CHECK: cond.false: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 -; CHECK-NEXT: br label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: ret i32 [[COND]] +; THUMB-LABEL: @or_invert_predicate_minsize( +; THUMB-NEXT: entry: +; THUMB-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D:%.*]], 3 +; THUMB-NEXT: br i1 [[CMP]], label [[LOR_LHS_FALSE:%.*]], label [[COND_END:%.*]] +; THUMB: lor.lhs.false: +; THUMB-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; THUMB-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; THUMB-NEXT: br i1 [[CMP1]], label [[COND_END]], label [[COND_FALSE:%.*]] +; THUMB: cond.false: +; THUMB-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; THUMB-NEXT: br label [[COND_END]] +; THUMB: cond.end: +; THUMB-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[LOR_LHS_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; THUMB-NEXT: ret i32 [[COND]] +; +; ARM-LABEL: @or_invert_predicate_minsize( +; ARM-NEXT: entry: +; ARM-NEXT: [[CMP:%.*]] = icmp sle i32 [[D:%.*]], 3 +; ARM-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; ARM-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; ARM-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] +; ARM-NEXT: br i1 [[OR_COND]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] +; ARM: cond.false: +; ARM-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; ARM-NEXT: br label [[COND_END]] +; ARM: cond.end: +; ARM-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; ARM-NEXT: ret i32 [[COND]] ; entry: %cmp = icmp sgt i32 %d, 3 @@ -267,19 +297,33 @@ cond.end: } define i32 @or_xor_predicate_minsize(i32 %a, i32 %b, i32 %c, i32 %d, i32* %input, i1 %cmp) #0 { -; CHECK-LABEL: @or_xor_predicate_minsize( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_NOT:%.*]] = xor i1 [[CMP:%.*]], true -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] -; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP_NOT]], [[CMP1]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; CHECK: cond.false: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 -; CHECK-NEXT: br label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: ret i32 [[COND]] +; THUMB-LABEL: @or_xor_predicate_minsize( +; THUMB-NEXT: entry: +; THUMB-NEXT: br i1 [[CMP:%.*]], label [[LOR_LHS_FALSE:%.*]], label [[COND_END:%.*]] +; THUMB: lor.lhs.false: +; THUMB-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; THUMB-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; THUMB-NEXT: br i1 [[CMP1]], label [[COND_END]], label [[COND_FALSE:%.*]] +; THUMB: cond.false: +; THUMB-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; THUMB-NEXT: br label [[COND_END]] +; THUMB: cond.end: +; THUMB-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[LOR_LHS_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; THUMB-NEXT: ret i32 [[COND]] +; +; ARM-LABEL: @or_xor_predicate_minsize( +; ARM-NEXT: entry: +; ARM-NEXT: [[CMP_NOT:%.*]] = xor i1 [[CMP:%.*]], true +; ARM-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; ARM-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; ARM-NEXT: [[OR_COND:%.*]] = or i1 [[CMP_NOT]], [[CMP1]] +; ARM-NEXT: br i1 [[OR_COND]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] +; ARM: cond.false: +; ARM-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; ARM-NEXT: br label [[COND_END]] +; ARM: cond.end: +; ARM-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; ARM-NEXT: ret i32 [[COND]] ; entry: br i1 %cmp, label %lor.lhs.false, label %cond.end @@ -331,19 +375,33 @@ cond.end: } define i32 @and_xor_minsize(i32 %a, i32 %b, i32 %c, i32 %d, i32* %input, i1 %cmp) #0 { -; CHECK-LABEL: @and_xor_minsize( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_NOT:%.*]] = xor i1 [[CMP:%.*]], true -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] -; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP_NOT]], [[CMP1]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[COND_FALSE:%.*]], label [[COND_END:%.*]] -; CHECK: cond.false: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 -; CHECK-NEXT: br label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: ret i32 [[COND]] +; THUMB-LABEL: @and_xor_minsize( +; THUMB-NEXT: entry: +; THUMB-NEXT: br i1 [[CMP:%.*]], label [[COND_END:%.*]], label [[LOR_LHS_FALSE:%.*]] +; THUMB: lor.lhs.false: +; THUMB-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; THUMB-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; THUMB-NEXT: br i1 [[CMP1]], label [[COND_FALSE:%.*]], label [[COND_END]] +; THUMB: cond.false: +; THUMB-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; THUMB-NEXT: br label [[COND_END]] +; THUMB: cond.end: +; THUMB-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[LOR_LHS_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; THUMB-NEXT: ret i32 [[COND]] +; +; ARM-LABEL: @and_xor_minsize( +; ARM-NEXT: entry: +; ARM-NEXT: [[CMP_NOT:%.*]] = xor i1 [[CMP:%.*]], true +; ARM-NEXT: [[ADD:%.*]] = add nsw i32 [[C:%.*]], [[A:%.*]] +; ARM-NEXT: [[CMP1:%.*]] = icmp slt i32 [[ADD]], [[B:%.*]] +; ARM-NEXT: [[OR_COND:%.*]] = and i1 [[CMP_NOT]], [[CMP1]] +; ARM-NEXT: br i1 [[OR_COND]], label [[COND_FALSE:%.*]], label [[COND_END:%.*]] +; ARM: cond.false: +; ARM-NEXT: [[TMP0:%.*]] = load i32, i32* [[INPUT:%.*]], align 4 +; ARM-NEXT: br label [[COND_END]] +; ARM: cond.end: +; ARM-NEXT: [[COND:%.*]] = phi i32 [ [[TMP0]], [[COND_FALSE]] ], [ 0, [[ENTRY:%.*]] ] +; ARM-NEXT: ret i32 [[COND]] ; entry: br i1 %cmp, label %cond.end, label %lor.lhs.false