[HotColdSplit] Reflect full cost of parameters in split penalty

Make the penalty for splitting a region more accurately reflect the cost of materializing all of the inputs/outputs to/from the region. This almost entirely eliminates code growth within functions which undergo splitting in key internal frameworks, and reduces the size of those frameworks between 2.6% to 3%. rdar://49167240 Patch by: Vedant Kumar(@vsk) Reviewers: hiraditya,rjf,t.p.northover Reviewed By: hiraditya,rjf Differential Revision: https://reviews.llvm.org/D59715
2020-12-18 08:57:38 -08:00 · 2020-12-18 08:57:38 -08:00 · 1ab4db0f84
parent 2fced5a07b
commit 1ab4db0f84
6 changed files with 88 additions and 24 deletions
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@ -67,6 +67,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <algorithm>
+#include <limits>
 #include <cassert>
 #include <string>

@ -96,6 +97,10 @@ static cl::opt<std::string>
                    cl::desc("Name for the section containing cold functions "
                             "extracted by hot-cold splitting."));

+static cl::opt<int> MaxParametersForSplit(
+    "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
+    cl::desc("Maximum number of parameters for a split function"));
+
 namespace {
 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
 // this function unless you modify the MBB version as well.
@ -257,18 +262,6 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
  if (SplittingThreshold <= 0)
    return Penalty;

-  // The typical code size cost for materializing an argument for the outlined
-  // call.
-  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n");
-  const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic;
-  Penalty += CostForArgMaterialization * NumInputs;
-
-  // The typical code size cost for an output alloca, its associated store, and
-  // its associated reload.
-  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n");
-  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
-  Penalty += CostForRegionOutput * NumOutputs;
-
  // Find the number of distinct exit blocks for the region. Use a conservative
  // check to determine whether control returns from the region.
  bool NoBlocksReturn = true;
@ -289,6 +282,48 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
    }
  }

+  // Count the number of phis in exit blocks with >= 2 incoming values from the
+  // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
+  // and new outputs are created to supply the split phis. CodeExtractor can't
+  // report these new outputs until extraction begins, but it's important to
+  // factor the cost of the outputs into the cost calculation.
+  unsigned NumSplitExitPhis = 0;
+  for (BasicBlock *ExitBB : SuccsOutsideRegion) {
+    for (PHINode &PN : ExitBB->phis()) {
+      // Find all incoming values from the outlining region.
+      int NumIncomingVals = 0;
+      for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
+        if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
+          ++NumIncomingVals;
+          if (NumIncomingVals > 1) {
+            ++NumSplitExitPhis;
+            break;
+          }
+        }
+    }
+  }
+
+  // Apply a penalty for calling the split function. Factor in the cost of
+  // materializing all of the parameters.
+  int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
+  int NumParams = NumInputs + NumOutputsAndSplitPhis;
+  if (NumParams > MaxParametersForSplit) {
+    LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
+                      << " outputs exceeds parameter limit ("
+                      << MaxParametersForSplit << ")\n");
+    return std::numeric_limits<int>::max();
+  }
+  const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
+  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
+  Penalty += CostForArgMaterialization * NumParams;
+
+  // Apply the typical code size cost for an output alloca and its associated
+  // reload in the caller. Also penalize the associated store in the callee.
+  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
+                    << " outputs/split phis\n");
+  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
+  Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
+
  // Apply a `noreturn` bonus.
  if (NoBlocksReturn) {
    LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
@ -298,7 +333,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,

  // Apply a penalty for having more than one successor outside of the region.
  // This penalty accounts for the switch needed in the caller.
-  if (!SuccsOutsideRegion.empty()) {
+  if (SuccsOutsideRegion.size() > 1) {
    LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
                      << " non-region successors\n");
    Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
--- a/llvm/test/Transforms/CodeExtractor/extract-assume.ll
+++ b/llvm/test/Transforms/CodeExtractor/extract-assume.ll
@ -1,4 +1,4 @@
-; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -hotcoldsplit-threshold=-1 -disable-output %s 2>&1 | FileCheck %s
 ;
 ; Make sure this compiles. Check that function assumption cache is refreshed
 ; after extracting blocks with assume calls from the function.
--- a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
+++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -hotcoldsplit-max-params=2 -S < %s -o /dev/null 2>&1 | FileCheck %s

 declare void @sink(i32*, i32, i32) cold

@ -10,10 +10,27 @@ define void @foo(i32 %arg) {
  br i1 undef, label %cold, label %exit

 cold:
-  ; CHECK: Applying penalty for: 2 inputs
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 2 params
+  ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
+  ; CHECK-NEXT: penalty = 6
  call void @sink(i32* @g, i32 %arg, i32 %local)
  ret void

 exit:
  ret void
 }
+
+define void @bar(i32* %p1, i32 %p2, i32 %p3) {
+  br i1 undef, label %cold, label %exit
+
+cold:
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: 3 inputs and 0 outputs exceeds parameter limit (2)
+  ; CHECK-NEXT: penalty = 2147483647
+  call void @sink(i32* %p1, i32 %p2, i32 %p3)
+  ret void
+
+exit:
+  ret void
+}
--- a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
+++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s

 declare void @sink() cold

@ -10,8 +10,10 @@ entry:
  br i1 undef, label %cold, label %exit

 cold:
-  ; CHECK: Applying penalty for: 1 output
-  ; CHECK: Applying penalty for: 1 non-region successors
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 1 params
+  ; CHECK-NEXT: Applying penalty for: 1 outputs/split phis
+  ; CHECK-NEXT: penalty = 7
  %local = load i32, i32* @g
  call void @sink()
  br label %exit
--- a/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
+++ b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s

 declare void @sink() cold

@ -9,7 +9,10 @@ entry:
  br i1 undef, label %cold1, label %exit

 cold1:
-  ; CHECK: Applying penalty for: 1 non-region successor
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 0 params
+  ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
+  ; CHECK-NEXT: penalty = 2
  call void @sink()
  br i1 undef, label %cold2, label %cold3

@ -32,7 +35,11 @@ entry:
  br i1 undef, label %cold1, label %exit1

 cold1:
-  ; CHECK: Applying penalty for: 2 non-region successors
+  ; CHECK: Applying penalty for splitting: 2
+  ; CHECK-NEXT: Applying penalty for: 0 params
+  ; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
+  ; CHECK-NEXT: Applying penalty for: 2 non-region successors
+  ; CHECK-NEXT: penalty = 3
  call void @sink()
  br i1 undef, label %cold2, label %cold3

--- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
+++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll
@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -S -instsimplify -hotcoldsplit -debug < %s 2>&1 | FileCheck %s
+; RUN: opt -S -instsimplify -hotcoldsplit -hotcoldsplit-threshold=-1 -debug < %s 2>&1 | FileCheck %s
 ; RUN: opt -instcombine -hotcoldsplit -instsimplify %s -o /dev/null

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@ -13,7 +13,10 @@ target triple = "aarch64"
 ; CHECK-NOT: @llvm.assume
 ; CHECK: }
 ; CHECK: declare {{.*}}@llvm.assume
-; CHECK: define {{.*}}@f.cold.1(i64 %0)
+; CHECK: define {{.*}}@f.cold.1()
+; CHECK-LABEL: newFuncRoot:
+; CHECK: }
+; CHECK: define {{.*}}@f.cold.2(i64 %0)
 ; CHECK-LABEL: newFuncRoot:
 ; CHECK: %1 = icmp eq i64 %0, 0
 ; CHECK-NOT: call void @llvm.assume