forked from OSchip/llvm-project
[HotColdSplit] Reflect full cost of parameters in split penalty
Make the penalty for splitting a region more accurately reflect the cost of materializing all of the inputs/outputs to/from the region. This almost entirely eliminates code growth within functions which undergo splitting in key internal frameworks, and reduces the size of those frameworks by 2.6% to 3%. rdar://49167240 Patch by: Vedant Kumar (@vsk) Reviewers: hiraditya,rjf,t.p.northover Reviewed By: hiraditya,rjf Differential Revision: https://reviews.llvm.org/D59715
This commit is contained in:
parent
2fced5a07b
commit
1ab4db0f84
|
@ -67,6 +67,7 @@
|
|||
#include "llvm/Transforms/Utils/Local.h"
|
||||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <cassert>
|
||||
#include <string>
|
||||
|
||||
|
@ -96,6 +97,10 @@ static cl::opt<std::string>
|
|||
cl::desc("Name for the section containing cold functions "
|
||||
"extracted by hot-cold splitting."));
|
||||
|
||||
// Hidden option capping how many parameters a split function may take
// (default 4). getOutliningPenalty compares the region's inputs plus its
// outputs (including split exit phis) against this limit and returns the
// maximum int penalty when the limit is exceeded.
static cl::opt<int> MaxParametersForSplit(
|
||||
"hotcoldsplit-max-params", cl::init(4), cl::Hidden,
|
||||
cl::desc("Maximum number of parameters for a split function"));
|
||||
|
||||
namespace {
|
||||
// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
|
||||
// this function unless you modify the MBB version as well.
|
||||
|
@ -257,18 +262,6 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
|
|||
if (SplittingThreshold <= 0)
|
||||
return Penalty;
|
||||
|
||||
// The typical code size cost for materializing an argument for the outlined
|
||||
// call.
|
||||
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n");
|
||||
const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic;
|
||||
Penalty += CostForArgMaterialization * NumInputs;
|
||||
|
||||
// The typical code size cost for an output alloca, its associated store, and
|
||||
// its associated reload.
|
||||
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n");
|
||||
const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
|
||||
Penalty += CostForRegionOutput * NumOutputs;
|
||||
|
||||
// Find the number of distinct exit blocks for the region. Use a conservative
|
||||
// check to determine whether control returns from the region.
|
||||
bool NoBlocksReturn = true;
|
||||
|
@ -289,6 +282,48 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
|
|||
}
|
||||
}
|
||||
|
||||
// Count the number of phis in exit blocks with >= 2 incoming values from the
|
||||
// outlining region. These phis are split (\ref severSplitPHINodesOfExits),
|
||||
// and new outputs are created to supply the split phis. CodeExtractor can't
|
||||
// report these new outputs until extraction begins, but it's important to
|
||||
// factor the cost of the outputs into the cost calculation.
|
||||
unsigned NumSplitExitPhis = 0;
|
||||
for (BasicBlock *ExitBB : SuccsOutsideRegion) {
|
||||
for (PHINode &PN : ExitBB->phis()) {
|
||||
// Find all incoming values from the outlining region.
|
||||
int NumIncomingVals = 0;
|
||||
for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
|
||||
if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
|
||||
++NumIncomingVals;
|
||||
if (NumIncomingVals > 1) {
|
||||
++NumSplitExitPhis;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply a penalty for calling the split function. Factor in the cost of
|
||||
// materializing all of the parameters.
|
||||
int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
|
||||
int NumParams = NumInputs + NumOutputsAndSplitPhis;
|
||||
if (NumParams > MaxParametersForSplit) {
|
||||
LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
|
||||
<< " outputs exceeds parameter limit ("
|
||||
<< MaxParametersForSplit << ")\n");
|
||||
return std::numeric_limits<int>::max();
|
||||
}
|
||||
const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
|
||||
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
|
||||
Penalty += CostForArgMaterialization * NumParams;
|
||||
|
||||
// Apply the typical code size cost for an output alloca and its associated
|
||||
// reload in the caller. Also penalize the associated store in the callee.
|
||||
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
|
||||
<< " outputs/split phis\n");
|
||||
const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
|
||||
Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
|
||||
|
||||
// Apply a `noreturn` bonus.
|
||||
if (NoBlocksReturn) {
|
||||
LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
|
||||
|
@ -298,7 +333,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
|
|||
|
||||
// Apply a penalty for having more than one successor outside of the region.
|
||||
// This penalty accounts for the switch needed in the caller.
|
||||
if (!SuccsOutsideRegion.empty()) {
|
||||
if (SuccsOutsideRegion.size() > 1) {
|
||||
LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
|
||||
<< " non-region successors\n");
|
||||
Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -disable-output %s 2>&1 | FileCheck %s
|
||||
; RUN: opt -passes="function(slp-vectorizer),module(hotcoldsplit),function(slp-vectorizer,print<assumptions>)" -hotcoldsplit-threshold=-1 -disable-output %s 2>&1 | FileCheck %s
|
||||
;
|
||||
; Make sure this compiles. Check that function assumption cache is refreshed
|
||||
; after extracting blocks with assume calls from the function.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; REQUIRES: asserts
|
||||
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
|
||||
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -hotcoldsplit-max-params=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
|
||||
|
||||
declare void @sink(i32*, i32, i32) cold
|
||||
|
||||
|
@ -10,10 +10,27 @@ define void @foo(i32 %arg) {
|
|||
br i1 undef, label %cold, label %exit
|
||||
|
||||
cold:
|
||||
; CHECK: Applying penalty for: 2 inputs
|
||||
; CHECK: Applying penalty for splitting: 2
|
||||
; CHECK-NEXT: Applying penalty for: 2 params
|
||||
; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
|
||||
; CHECK-NEXT: penalty = 6
|
||||
call void @sink(i32* @g, i32 %arg, i32 %local)
|
||||
ret void
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @bar(i32* %p1, i32 %p2, i32 %p3) {
|
||||
br i1 undef, label %cold, label %exit
|
||||
|
||||
cold:
|
||||
; CHECK: Applying penalty for splitting: 2
|
||||
; CHECK-NEXT: 3 inputs and 0 outputs exceeds parameter limit (2)
|
||||
; CHECK-NEXT: penalty = 2147483647
|
||||
call void @sink(i32* %p1, i32 %p2, i32 %p3)
|
||||
ret void
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; REQUIRES: asserts
|
||||
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
|
||||
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
|
||||
|
||||
declare void @sink() cold
|
||||
|
||||
|
@ -10,8 +10,10 @@ entry:
|
|||
br i1 undef, label %cold, label %exit
|
||||
|
||||
cold:
|
||||
; CHECK: Applying penalty for: 1 output
|
||||
; CHECK: Applying penalty for: 1 non-region successors
|
||||
; CHECK: Applying penalty for splitting: 2
|
||||
; CHECK-NEXT: Applying penalty for: 1 params
|
||||
; CHECK-NEXT: Applying penalty for: 1 outputs/split phis
|
||||
; CHECK-NEXT: penalty = 7
|
||||
%local = load i32, i32* @g
|
||||
call void @sink()
|
||||
br label %exit
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; REQUIRES: asserts
|
||||
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
|
||||
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -hotcoldsplit-threshold=2 -S < %s -o /dev/null 2>&1 | FileCheck %s
|
||||
|
||||
declare void @sink() cold
|
||||
|
||||
|
@ -9,7 +9,10 @@ entry:
|
|||
br i1 undef, label %cold1, label %exit
|
||||
|
||||
cold1:
|
||||
; CHECK: Applying penalty for: 1 non-region successor
|
||||
; CHECK: Applying penalty for splitting: 2
|
||||
; CHECK-NEXT: Applying penalty for: 0 params
|
||||
; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
|
||||
; CHECK-NEXT: penalty = 2
|
||||
call void @sink()
|
||||
br i1 undef, label %cold2, label %cold3
|
||||
|
||||
|
@ -32,7 +35,11 @@ entry:
|
|||
br i1 undef, label %cold1, label %exit1
|
||||
|
||||
cold1:
|
||||
; CHECK: Applying penalty for: 2 non-region successors
|
||||
; CHECK: Applying penalty for splitting: 2
|
||||
; CHECK-NEXT: Applying penalty for: 0 params
|
||||
; CHECK-NEXT: Applying penalty for: 0 outputs/split phis
|
||||
; CHECK-NEXT: Applying penalty for: 2 non-region successors
|
||||
; CHECK-NEXT: penalty = 3
|
||||
call void @sink()
|
||||
br i1 undef, label %cold2, label %cold3
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; REQUIRES: asserts
|
||||
; RUN: opt -S -instsimplify -hotcoldsplit -debug < %s 2>&1 | FileCheck %s
|
||||
; RUN: opt -S -instsimplify -hotcoldsplit -hotcoldsplit-threshold=-1 -debug < %s 2>&1 | FileCheck %s
|
||||
; RUN: opt -instcombine -hotcoldsplit -instsimplify %s -o /dev/null
|
||||
|
||||
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
||||
|
@ -13,7 +13,10 @@ target triple = "aarch64"
|
|||
; CHECK-NOT: @llvm.assume
|
||||
; CHECK: }
|
||||
; CHECK: declare {{.*}}@llvm.assume
|
||||
; CHECK: define {{.*}}@f.cold.1(i64 %0)
|
||||
; CHECK: define {{.*}}@f.cold.1()
|
||||
; CHECK-LABEL: newFuncRoot:
|
||||
; CHECK: }
|
||||
; CHECK: define {{.*}}@f.cold.2(i64 %0)
|
||||
; CHECK-LABEL: newFuncRoot:
|
||||
; CHECK: %1 = icmp eq i64 %0, 0
|
||||
; CHECK-NOT: call void @llvm.assume
|
||||
|
|
Loading…
Reference in New Issue