[HotColdSplit] Reflect full cost of parameters in split penalty

Make the penalty for splitting a region more accurately reflect the cost
of materializing all of the inputs/outputs to/from the region.

This almost entirely eliminates code growth within functions that
undergo splitting in key internal frameworks, and reduces the size of
those frameworks by between 2.6% and 3%.

rdar://49167240

Patch by: Vedant Kumar (@vsk)
Reviewers: hiraditya,rjf,t.p.northover
Reviewed By: hiraditya,rjf

Differential Revision: https://reviews.llvm.org/D59715
This commit is contained in:
Aditya Kumar 2020-12-18 08:57:38 -08:00
parent 2fced5a07b
commit 1ab4db0f84
6 changed files with 88 additions and 24 deletions

View File

@ -67,6 +67,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <limits>
#include <cassert>
#include <string>
@ -96,6 +97,10 @@ static cl::opt<std::string>
cl::desc("Name for the section containing cold functions "
"extracted by hot-cold splitting."));
// Hidden command-line knob (default 4) capping how many parameters an
// outlined cold function may take. getOutliningPenalty() counts region inputs
// plus outputs (including split exit phis) as parameters; when that count
// exceeds this limit it returns std::numeric_limits<int>::max() as the penalty.
static cl::opt<int> MaxParametersForSplit(
"hotcoldsplit-max-params", cl::init(4), cl::Hidden,
cl::desc("Maximum number of parameters for a split function"));
namespace {
// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
// this function unless you modify the MBB version as well.
@ -257,18 +262,6 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
if (SplittingThreshold <= 0)
return Penalty;
// The typical code size cost for materializing an argument for the outlined
// call.
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n");
const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic;
Penalty += CostForArgMaterialization * NumInputs;
// The typical code size cost for an output alloca, its associated store, and
// its associated reload.
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n");
const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
Penalty += CostForRegionOutput * NumOutputs;
// Find the number of distinct exit blocks for the region. Use a conservative
// check to determine whether control returns from the region.
bool NoBlocksReturn = true;
@ -289,6 +282,48 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
}
}
// Count the number of phis in exit blocks with >= 2 incoming values from the
// outlining region. These phis are split (\ref severSplitPHINodesOfExits),
// and new outputs are created to supply the split phis. CodeExtractor can't
// report these new outputs until extraction begins, but it's important to
// factor the cost of the outputs into the cost calculation.
unsigned NumSplitExitPhis = 0;
for (BasicBlock *ExitBB : SuccsOutsideRegion) {
for (PHINode &PN : ExitBB->phis()) {
// Find all incoming values from the outlining region.
int NumIncomingVals = 0;
for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
++NumIncomingVals;
if (NumIncomingVals > 1) {
++NumSplitExitPhis;
break;
}
}
}
}
// Apply a penalty for calling the split function. Factor in the cost of
// materializing all of the parameters.
int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
int NumParams = NumInputs + NumOutputsAndSplitPhis;
if (NumParams > MaxParametersForSplit) {
LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
<< " outputs exceeds parameter limit ("
<< MaxParametersForSplit << ")\n");
return std::numeric_limits<int>::max();
}
const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
Penalty += CostForArgMaterialization * NumParams;
// Apply the typical code size cost for an output alloca and its associated
// reload in the caller. Also penalize the associated store in the callee.
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
<< " outputs/split phis\n");
const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
// Apply a `noreturn` bonus.
if (NoBlocksReturn) {
LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
@ -298,7 +333,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
// Apply a penalty for having more than one successor outside of the region.
// This penalty accounts for the switch needed in the caller.
if (!SuccsOutsideRegion.empty()) {
if (SuccsOutsideRegion.size() > 1) {
LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
<< " non-region successors\n");
Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;

View File

@ -1,4 +1,4 @@
; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -disable-output %s 2>&1 | FileCheck %s
; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -hotcoldsplit-threshold=-1 -disable-output %s 2>&1 | FileCheck %s
;
; Make sure this compiles. Check that function assumption cache is refreshed
; after extracting blocks with assume calls from the function.

View File

@ -1,5 +1,5 @@
; REQUIRES: asserts
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -hotcoldsplit-max-params=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
declare void @sink(i32*, i32, i32) cold
@ -10,10 +10,27 @@ define void @foo(i32 %arg) {
br i1 undef, label %cold, label %exit
cold:
; CHECK: Applying penalty for: 2 inputs
; CHECK: Applying penalty for splitting: 2
; CHECK-NEXT: Applying penalty for: 2 params
; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
; CHECK-NEXT: penalty = 6
call void @sink(i32* @g, i32 %arg, i32 %local)
ret void
exit:
ret void
}
define void @bar(i32* %p1, i32 %p2, i32 %p3) {
br i1 undef, label %cold, label %exit
cold:
; CHECK: Applying penalty for splitting: 2
; CHECK-NEXT: 3 inputs and 0 outputs exceeds parameter limit (2)
; CHECK-NEXT: penalty = 2147483647
call void @sink(i32* %p1, i32 %p2, i32 %p3)
ret void
exit:
ret void
}

View File

@ -1,5 +1,5 @@
; REQUIRES: asserts
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
declare void @sink() cold
@ -10,8 +10,10 @@ entry:
br i1 undef, label %cold, label %exit
cold:
; CHECK: Applying penalty for: 1 output
; CHECK: Applying penalty for: 1 non-region successors
; CHECK: Applying penalty for splitting: 2
; CHECK-NEXT: Applying penalty for: 1 params
; CHECK-NEXT: Applying penalty for: 1 outputs/split phis
; CHECK-NEXT: penalty = 7
%local = load i32, i32* @g
call void @sink()
br label %exit

View File

@ -1,5 +1,5 @@
; REQUIRES: asserts
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
declare void @sink() cold
@ -9,7 +9,10 @@ entry:
br i1 undef, label %cold1, label %exit
cold1:
; CHECK: Applying penalty for: 1 non-region successor
; CHECK: Applying penalty for splitting: 2
; CHECK-NEXT: Applying penalty for: 0 params
; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
; CHECK-NEXT: penalty = 2
call void @sink()
br i1 undef, label %cold2, label %cold3
@ -32,7 +35,11 @@ entry:
br i1 undef, label %cold1, label %exit1
cold1:
; CHECK: Applying penalty for: 2 non-region successors
; CHECK: Applying penalty for splitting: 2
; CHECK-NEXT: Applying penalty for: 0 params
; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
; CHECK-NEXT: Applying penalty for: 2 non-region successors
; CHECK-NEXT: penalty = 3
call void @sink()
br i1 undef, label %cold2, label %cold3

View File

@ -1,5 +1,5 @@
; REQUIRES: asserts
; RUN: opt -S -instsimplify -hotcoldsplit -debug < %s 2>&1 | FileCheck %s
; RUN: opt -S -instsimplify -hotcoldsplit -hotcoldsplit-threshold=-1 -debug < %s 2>&1 | FileCheck %s
; RUN: opt -instcombine -hotcoldsplit -instsimplify %s -o /dev/null
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@ -13,7 +13,10 @@ target triple = "aarch64"
; CHECK-NOT: @llvm.assume
; CHECK: }
; CHECK: declare {{.*}}@llvm.assume
; CHECK: define {{.*}}@f.cold.1(i64 %0)
; CHECK: define {{.*}}@f.cold.1()
; CHECK-LABEL: newFuncRoot:
; CHECK: }
; CHECK: define {{.*}}@f.cold.2(i64 %0)
; CHECK-LABEL: newFuncRoot:
; CHECK: %1 = icmp eq i64 %0, 0
; CHECK-NOT: call void @llvm.assume