[FuncSpec] Support function specialization across multiple arguments.

The current implementation of Function Specialization does not allow
specializing more than one argument per function call; this patch lifts
that limitation.

My main challenge was choosing the most suitable ADT for storing the
specializations. We need an associative container that binds all the
actual arguments of a specialization to the corresponding function
call. We also need a consistent iteration order across executions.
Lastly, we want to be able to sort the entries by Gain and reject the
least profitable ones.

MapVector almost fits the bill, but not quite: erasing elements is
expensive, and stable_sort would desynchronize the indices into the
underlying vector. I am therefore operating on the underlying vector
directly once the Gain has been calculated.

Differential Revision: https://reviews.llvm.org/D119880
Author: Alexandros Lamprineas
Date: 2022-03-23 14:51:16 +00:00
parent 4ca111d4cb
commit 8045bf9d0d
5 changed files with 310 additions and 99 deletions


@ -151,13 +151,14 @@ public:
/// Return a reference to the set of argument tracked functions.
SmallPtrSetImpl<Function *> &getArgumentTrackedFunctions();
/// Mark the constant argument of a new function specialization. \p F points
/// to the cloned function and \p Arg represents the constant argument as a
/// pair of {formal,actual} values (the formal argument is associated with the
/// original function definition). All other arguments of the specialization
/// inherit the lattice state of their corresponding values in the original
/// function.
void markArgInFuncSpecialization(Function *F, const ArgInfo &Arg);
/// Mark the constant arguments of a new function specialization. \p F points
/// to the cloned function and \p Args contains a list of constant arguments
/// represented as pairs of {formal,actual} values (the formal argument is
/// associated with the original function definition). All other arguments of
/// the specialization inherit the lattice state of their corresponding values
/// in the original function.
void markArgInFuncSpecialization(Function *F,
const SmallVectorImpl<ArgInfo> &Args);
/// Mark all of the blocks in function \p F non-executable. Clients can use
/// this method to erase a function from the module (e.g., if it has been


@ -99,8 +99,13 @@ static cl::opt<bool> SpecializeOnAddresses(
"func-specialization-on-address", cl::init(false), cl::Hidden,
cl::desc("Enable function specialization on the address of global values"));
// TODO: This needs checking to see the impact on compile-times, which is why
// this is off by default for now.
// Disabled by default as it can significantly increase compilation times.
// Running nikic's compile time tracker on x86 with instruction count as the
// metric shows 3-4% regression for SPASS while being neutral for all other
// benchmarks of the llvm test suite.
//
// https://llvm-compile-time-tracker.com
// https://github.com/nikic/llvm-compile-time-tracker
static cl::opt<bool> EnableSpecializationForLiteralConstant(
"function-specialization-for-literal-constant", cl::init(false), cl::Hidden,
cl::desc("Enable specialization of functions that take a literal constant "
@ -110,17 +115,17 @@ namespace {
// Bookkeeping struct to pass data from the analysis and profitability phase
// to the actual transform helper functions.
struct SpecializationInfo {
ArgInfo Arg; // Stores the {formal,actual} argument pair.
InstructionCost Gain; // Profitability: Gain = Bonus - Cost.
SpecializationInfo(Argument *A, Constant *C, InstructionCost G)
: Arg(A, C), Gain(G){};
SmallVector<ArgInfo, 8> Args; // Stores the {formal,actual} argument pairs.
InstructionCost Gain; // Profitability: Gain = Bonus - Cost.
};
} // Anonymous namespace
using FuncList = SmallVectorImpl<Function *>;
using ConstList = SmallVector<Constant *>;
using SpecializationList = SmallVector<SpecializationInfo>;
using CallArgBinding = std::pair<CallBase *, Constant *>;
using CallSpecBinding = std::pair<CallBase *, SpecializationInfo>;
// We are using MapVector because it guarantees deterministic iteration
// order across executions.
using SpecializationMap = SmallMapVector<CallBase *, SpecializationInfo, 8>;
// Helper to check if \p LV is either a constant or a constant
// range with a single element. This should cover exactly the same cases as the
@ -307,17 +312,15 @@ public:
LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
<< F->getName() << " is " << Cost << "\n");
SpecializationList Specializations;
calculateGains(F, Cost, Specializations);
if (Specializations.empty()) {
LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n");
SmallVector<CallSpecBinding, 8> Specializations;
if (!calculateGains(F, Cost, Specializations)) {
LLVM_DEBUG(dbgs() << "FnSpecialization: No possible constants found\n");
continue;
}
for (SpecializationInfo &S : Specializations) {
specializeFunction(F, S, WorkList);
Changed = true;
}
Changed = true;
for (auto &Entry : Specializations)
specializeFunction(F, Entry.second, WorkList);
}
updateSpecializedFuncs(Candidates, WorkList);
@ -392,21 +395,22 @@ private:
return Clone;
}
/// This function decides whether it's worthwhile to specialize function \p F
/// based on the known constant values its arguments can take on, i.e. it
/// calculates a gain and returns a list of actual arguments that are deemed
/// profitable to specialize. Specialization is performed on the first
/// interesting argument. Specializations based on additional arguments will
/// be evaluated on following iterations of the main IPSCCP solve loop.
void calculateGains(Function *F, InstructionCost Cost,
SpecializationList &WorkList) {
/// This function decides whether it's worthwhile to specialize function
/// \p F based on the known constant values its arguments can take on. It
/// only discovers potential specialization opportunities without actually
/// applying them.
///
/// \returns true if any specializations have been found.
bool calculateGains(Function *F, InstructionCost Cost,
SmallVectorImpl<CallSpecBinding> &WorkList) {
SpecializationMap Specializations;
// Determine if we should specialize the function based on the values the
// argument can take on. If specialization is not profitable, we continue
// on to the next argument.
for (Argument &FormalArg : F->args()) {
// Determine if this argument is interesting. If we know the argument can
// take on any constant values, they are collected in Constants.
ConstList ActualArgs;
SmallVector<CallArgBinding, 8> ActualArgs;
if (!isArgumentInteresting(&FormalArg, ActualArgs)) {
LLVM_DEBUG(dbgs() << "FnSpecialization: Argument "
<< FormalArg.getNameOrAsOperand()
@ -414,50 +418,56 @@ private:
continue;
}
for (auto *ActualArg : ActualArgs) {
InstructionCost Gain =
ForceFunctionSpecialization
? 1
: getSpecializationBonus(&FormalArg, ActualArg) - Cost;
for (const auto &Entry : ActualArgs) {
CallBase *Call = Entry.first;
Constant *ActualArg = Entry.second;
if (Gain <= 0)
continue;
WorkList.push_back({&FormalArg, ActualArg, Gain});
auto I = Specializations.insert({Call, SpecializationInfo()});
SpecializationInfo &S = I.first->second;
if (I.second)
S.Gain = ForceFunctionSpecialization ? 1 : 0 - Cost;
if (!ForceFunctionSpecialization)
S.Gain += getSpecializationBonus(&FormalArg, ActualArg);
S.Args.push_back({&FormalArg, ActualArg});
}
if (WorkList.empty())
continue;
// Sort the candidates in descending order.
llvm::stable_sort(WorkList, [](const SpecializationInfo &L,
const SpecializationInfo &R) {
return L.Gain > R.Gain;
});
// Truncate the worklist to 'MaxClonesThreshold' candidates if
// necessary.
if (WorkList.size() > MaxClonesThreshold) {
LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceed "
<< "the maximum number of clones threshold.\n"
<< "FnSpecialization: Truncating worklist to "
<< MaxClonesThreshold << " candidates.\n");
WorkList.erase(WorkList.begin() + MaxClonesThreshold, WorkList.end());
}
LLVM_DEBUG(dbgs() << "FnSpecialization: Specializations for function "
<< F->getName() << "\n";
for (SpecializationInfo &S
: WorkList) {
dbgs() << "FnSpecialization: FormalArg = "
<< S.Arg.Formal->getNameOrAsOperand()
<< ", ActualArg = "
<< S.Arg.Actual->getNameOrAsOperand()
<< ", Gain = " << S.Gain << "\n";
});
// FIXME: Only one argument per function.
break;
}
// Remove unprofitable specializations.
Specializations.remove_if(
[](const auto &Entry) { return Entry.second.Gain <= 0; });
// Clear the MapVector and return the underlying vector.
WorkList = Specializations.takeVector();
// Sort the candidates in descending order.
llvm::stable_sort(WorkList, [](const auto &L, const auto &R) {
return L.second.Gain > R.second.Gain;
});
// Truncate the worklist to 'MaxClonesThreshold' candidates if necessary.
if (WorkList.size() > MaxClonesThreshold) {
LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceed "
<< "the maximum number of clones threshold.\n"
<< "FnSpecialization: Truncating worklist to "
<< MaxClonesThreshold << " candidates.\n");
WorkList.erase(WorkList.begin() + MaxClonesThreshold, WorkList.end());
}
LLVM_DEBUG(dbgs() << "FnSpecialization: Specializations for function "
<< F->getName() << "\n";
for (const auto &Entry
: WorkList) {
dbgs() << "FnSpecialization: Gain = " << Entry.second.Gain
<< "\n";
for (const ArgInfo &Arg : Entry.second.Args)
dbgs() << "FnSpecialization: FormalArg = "
<< Arg.Formal->getNameOrAsOperand()
<< ", ActualArg = "
<< Arg.Actual->getNameOrAsOperand() << "\n";
});
return !WorkList.empty();
}
bool isCandidateFunction(Function *F) {
@ -490,12 +500,12 @@ private:
Function *Clone = cloneCandidateFunction(F, Mappings);
// Rewrite calls to the function so that they call the clone instead.
rewriteCallSites(Clone, S.Arg, Mappings);
rewriteCallSites(Clone, S.Args, Mappings);
// Initialize the lattice state of the arguments of the function clone,
// marking the argument on which we specialized the function constant
// with the given value.
Solver.markArgInFuncSpecialization(Clone, S.Arg);
Solver.markArgInFuncSpecialization(Clone, S.Args);
// Mark all the specialized functions
WorkList.push_back(Clone);
@ -641,7 +651,8 @@ private:
///
/// \returns true if the function should be specialized on the given
/// argument.
bool isArgumentInteresting(Argument *A, ConstList &Constants) {
bool isArgumentInteresting(Argument *A,
SmallVectorImpl<CallArgBinding> &Constants) {
// For now, don't attempt to specialize functions based on the values of
// composite types.
if (!A->getType()->isSingleValueType() || A->user_empty())
@ -681,7 +692,8 @@ private:
/// Collect in \p Constants all the constant values that argument \p A can
/// take on.
void getPossibleConstants(Argument *A, ConstList &Constants) {
void getPossibleConstants(Argument *A,
SmallVectorImpl<CallArgBinding> &Constants) {
Function *F = A->getParent();
// Iterate over all the call sites of the argument's parent function.
@ -723,23 +735,24 @@ private:
if (isa<Constant>(V) && (Solver.getLatticeValueFor(V).isConstant() ||
EnableSpecializationForLiteralConstant))
Constants.push_back(cast<Constant>(V));
Constants.push_back({&CS, cast<Constant>(V)});
}
}
/// Rewrite calls to function \p F to call function \p Clone instead.
///
/// This function modifies calls to function \p F as long as the actual
/// argument matches the one in \p Arg. Note that for recursive calls we
/// need to compare against the cloned formal argument.
/// arguments match those in \p Args. Note that for recursive calls we
/// need to compare against the cloned formal arguments.
///
/// Callsites that have been marked with the MinSize function attribute won't
/// be specialized and rewritten.
void rewriteCallSites(Function *Clone, const ArgInfo &Arg,
void rewriteCallSites(Function *Clone, const SmallVectorImpl<ArgInfo> &Args,
ValueToValueMapTy &Mappings) {
Function *F = Arg.Formal->getParent();
unsigned ArgNo = Arg.Formal->getArgNo();
SmallVector<CallBase *, 4> CallSitesToRewrite;
assert(!Args.empty() && "Specialization without arguments");
Function *F = Args[0].Formal->getParent();
SmallVector<CallBase *, 8> CallSitesToRewrite;
for (auto *U : F->users()) {
if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
continue;
@ -758,9 +771,16 @@ private:
<< "\n");
if (/* recursive call */
(CS->getFunction() == Clone &&
CS->getArgOperand(ArgNo) == Mappings[Arg.Formal]) ||
all_of(Args,
[CS, &Mappings](const ArgInfo &Arg) {
unsigned ArgNo = Arg.Formal->getArgNo();
return CS->getArgOperand(ArgNo) == Mappings[Arg.Formal];
})) ||
/* normal call */
CS->getArgOperand(ArgNo) == Arg.Actual) {
all_of(Args, [CS](const ArgInfo &Arg) {
unsigned ArgNo = Arg.Formal->getArgNo();
return CS->getArgOperand(ArgNo) == Arg.Actual;
})) {
CS->setCalledFunction(Clone);
Solver.markOverdefined(CS);
}
@ -891,7 +911,7 @@ bool llvm::runFunctionSpecialization(
// Initially resolve the constants in all the argument tracked functions.
RunSCCPSolver(FuncDecls);
SmallVector<Function *, 2> WorkList;
SmallVector<Function *, 8> WorkList;
unsigned I = 0;
while (FuncSpecializationMaxIters != I++ &&
FS.specializeFunctions(FuncDecls, WorkList)) {


@ -450,7 +450,8 @@ public:
return TrackingIncomingArguments;
}
void markArgInFuncSpecialization(Function *F, const ArgInfo &Arg);
void markArgInFuncSpecialization(Function *F,
const SmallVectorImpl<ArgInfo> &Args);
void markFunctionUnreachable(Function *F) {
for (auto &BB : *F)
@ -524,21 +525,24 @@ Constant *SCCPInstVisitor::getConstant(const ValueLatticeElement &LV) const {
return nullptr;
}
void SCCPInstVisitor::markArgInFuncSpecialization(Function *F,
const ArgInfo &Arg) {
assert(F->arg_size() == Arg.Formal->getParent()->arg_size() &&
void SCCPInstVisitor::markArgInFuncSpecialization(
Function *F, const SmallVectorImpl<ArgInfo> &Args) {
assert(!Args.empty() && "Specialization without arguments");
assert(F->arg_size() == Args[0].Formal->getParent()->arg_size() &&
"Functions should have the same number of arguments");
auto Iter = Args.begin();
Argument *NewArg = F->arg_begin();
Argument *OldArg = Arg.Formal->getParent()->arg_begin();
Argument *OldArg = Args[0].Formal->getParent()->arg_begin();
for (auto End = F->arg_end(); NewArg != End; ++NewArg, ++OldArg) {
LLVM_DEBUG(dbgs() << "SCCP: Marking argument "
<< NewArg->getNameOrAsOperand() << "\n");
if (OldArg == Arg.Formal) {
if (OldArg == Iter->Formal) {
// Mark the argument constants in the new function.
markConstant(NewArg, Arg.Actual);
markConstant(NewArg, Iter->Actual);
++Iter;
} else if (ValueState.count(OldArg)) {
// For the remaining arguments in the new function, copy the lattice state
// over from the old function.
@ -1717,8 +1721,9 @@ SmallPtrSetImpl<Function *> &SCCPSolver::getArgumentTrackedFunctions() {
return Visitor->getArgumentTrackedFunctions();
}
void SCCPSolver::markArgInFuncSpecialization(Function *F, const ArgInfo &Arg) {
Visitor->markArgInFuncSpecialization(F, Arg);
void SCCPSolver::markArgInFuncSpecialization(
Function *F, const SmallVectorImpl<ArgInfo> &Args) {
Visitor->markArgInFuncSpecialization(F, Args);
}
void SCCPSolver::markFunctionUnreachable(Function *F) {


@ -46,7 +46,7 @@ entry:
; CHECK-NEXT: entry:
; CHECK-NEXT: %0 = load i32, i32* @A, align 4
; CHECK-NEXT: %add = add nsw i32 %x, %0
; CHECK-NEXT: %1 = load i32, i32* %c, align 4
; CHECK-NEXT: %1 = load i32, i32* @C, align 4
; CHECK-NEXT: %add1 = add nsw i32 %add, %1
; CHECK-NEXT: ret i32 %add1
; CHECK-NEXT: }
@ -55,7 +55,7 @@ entry:
; CHECK-NEXT: entry:
; CHECK-NEXT: %0 = load i32, i32* @B, align 4
; CHECK-NEXT: %add = add nsw i32 %x, %0
; CHECK-NEXT: %1 = load i32, i32* %c, align 4
; CHECK-NEXT: %1 = load i32, i32* @D, align 4
; CHECK-NEXT: %add1 = add nsw i32 %add, %1
; CHECK-NEXT: ret i32 %add1
; CHECK-NEXT: }


@ -0,0 +1,185 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -function-specialization -func-specialization-max-clones=0 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=NONE
; RUN: opt -function-specialization -func-specialization-max-clones=1 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=ONE
; RUN: opt -function-specialization -func-specialization-max-clones=2 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=TWO
; RUN: opt -function-specialization -func-specialization-max-clones=3 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=THREE
; Make sure that we iterate correctly after sorting the specializations:
; FnSpecialization: Specializations for function compute
; FnSpecialization: Gain = 608
; FnSpecialization: FormalArg = binop1, ActualArg = power
; FnSpecialization: FormalArg = binop2, ActualArg = mul
; FnSpecialization: Gain = 982
; FnSpecialization: FormalArg = binop1, ActualArg = plus
; FnSpecialization: FormalArg = binop2, ActualArg = minus
; FnSpecialization: Gain = 795
; FnSpecialization: FormalArg = binop1, ActualArg = minus
; FnSpecialization: FormalArg = binop2, ActualArg = power
define i64 @main(i64 %x, i64 %y, i1 %flag) {
; NONE-LABEL: @main(
; NONE-NEXT: entry:
; NONE-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
; NONE: plus:
; NONE-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
; NONE-NEXT: br label [[MERGE:%.*]]
; NONE: minus:
; NONE-NEXT: [[TMP1:%.*]] = call i64 @compute(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
; NONE-NEXT: br label [[MERGE]]
; NONE: merge:
; NONE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
; NONE-NEXT: [[TMP3:%.*]] = call i64 @compute(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
; NONE-NEXT: ret i64 [[TMP3]]
;
; ONE-LABEL: @main(
; ONE-NEXT: entry:
; ONE-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
; ONE: plus:
; ONE-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
; ONE-NEXT: br label [[MERGE:%.*]]
; ONE: minus:
; ONE-NEXT: [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
; ONE-NEXT: br label [[MERGE]]
; ONE: merge:
; ONE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
; ONE-NEXT: [[TMP3:%.*]] = call i64 @compute(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
; ONE-NEXT: ret i64 [[TMP3]]
;
; TWO-LABEL: @main(
; TWO-NEXT: entry:
; TWO-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
; TWO: plus:
; TWO-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
; TWO-NEXT: br label [[MERGE:%.*]]
; TWO: minus:
; TWO-NEXT: [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
; TWO-NEXT: br label [[MERGE]]
; TWO: merge:
; TWO-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
; TWO-NEXT: [[TMP3:%.*]] = call i64 @compute.2(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
; TWO-NEXT: ret i64 [[TMP3]]
;
; THREE-LABEL: @main(
; THREE-NEXT: entry:
; THREE-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
; THREE: plus:
; THREE-NEXT: [[TMP0:%.*]] = call i64 @compute.3(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
; THREE-NEXT: br label [[MERGE:%.*]]
; THREE: minus:
; THREE-NEXT: [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
; THREE-NEXT: br label [[MERGE]]
; THREE: merge:
; THREE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
; THREE-NEXT: [[TMP3:%.*]] = call i64 @compute.2(i64 [[TMP2]], i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
; THREE-NEXT: ret i64 [[TMP3]]
;
entry:
br i1 %flag, label %plus, label %minus
plus:
%tmp0 = call i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
br label %merge
minus:
%tmp1 = call i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
br label %merge
merge:
%tmp2 = phi i64 [ %tmp0, %plus ], [ %tmp1, %minus]
%tmp3 = call i64 @compute(i64 %tmp2, i64 42, i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
ret i64 %tmp3
}
; THREE-NOT: define internal i64 @compute
;
; THREE-LABEL: define internal i64 @compute.1(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
; THREE-NEXT: entry:
; THREE-NEXT: [[TMP0:%.+]] = call i64 @plus(i64 %x, i64 %y)
; THREE-NEXT: [[TMP1:%.+]] = call i64 @minus(i64 %x, i64 %y)
; THREE-NEXT: [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]]
; THREE-NEXT: [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x
; THREE-NEXT: [[TMP4:%.+]] = sub i64 [[TMP3]], %y
; THREE-NEXT: [[TMP5:%.+]] = mul i64 [[TMP4]], 2
; THREE-NEXT: ret i64 [[TMP5]]
; THREE-NEXT: }
;
; THREE-LABEL: define internal i64 @compute.2(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
; THREE-NEXT: entry:
; THREE-NEXT: [[TMP0:%.+]] = call i64 @minus(i64 %x, i64 %y)
; THREE-NEXT: [[TMP1:%.+]] = call i64 @power(i64 %x, i64 %y)
; THREE-NEXT: [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]]
; THREE-NEXT: [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x
; THREE-NEXT: [[TMP4:%.+]] = sub i64 [[TMP3]], %y
; THREE-NEXT: [[TMP5:%.+]] = mul i64 [[TMP4]], 2
; THREE-NEXT: ret i64 [[TMP5]]
; THREE-NEXT: }
;
; THREE-LABEL: define internal i64 @compute.3(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
; THREE-NEXT: entry:
; THREE-NEXT: [[TMP0:%.+]] = call i64 @power(i64 %x, i64 %y)
; THREE-NEXT: [[TMP1:%.+]] = call i64 @mul(i64 %x, i64 %y)
; THREE-NEXT: [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]]
; THREE-NEXT: [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x
; THREE-NEXT: [[TMP4:%.+]] = sub i64 [[TMP3]], %y
; THREE-NEXT: [[TMP5:%.+]] = mul i64 [[TMP4]], 2
; THREE-NEXT: ret i64 [[TMP5]]
; THREE-NEXT: }
;
define internal i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
entry:
%tmp0 = call i64 %binop1(i64 %x, i64 %y)
%tmp1 = call i64 %binop2(i64 %x, i64 %y)
%add = add i64 %tmp0, %tmp1
%div = sdiv i64 %add, %x
%sub = sub i64 %div, %y
%mul = mul i64 %sub, 2
ret i64 %mul
}
define internal i64 @plus(i64 %x, i64 %y) {
entry:
%tmp0 = add i64 %x, %y
ret i64 %tmp0
}
define internal i64 @minus(i64 %x, i64 %y) {
entry:
%tmp0 = sub i64 %x, %y
ret i64 %tmp0
}
define internal i64 @mul(i64 %x, i64 %n) {
entry:
%cmp6 = icmp sgt i64 %n, 1
br i1 %cmp6, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%x.addr.0.lcssa = phi i64 [ %x, %entry ], [ %add, %for.body ]
ret i64 %x.addr.0.lcssa
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %entry ]
%x.addr.07 = phi i64 [ %add, %for.body ], [ %x, %entry ]
%add = shl nsw i64 %x.addr.07, 1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
define internal i64 @power(i64 %x, i64 %n) {
entry:
%cmp6 = icmp sgt i64 %n, 1
br i1 %cmp6, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%x.addr.0.lcssa = phi i64 [ %x, %entry ], [ %mul, %for.body ]
ret i64 %x.addr.0.lcssa
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %entry ]
%x.addr.07 = phi i64 [ %mul, %for.body ], [ %x, %entry ]
%mul = mul nsw i64 %x.addr.07, %x.addr.07
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}