forked from OSchip/llvm-project
[AutoFDO] Inline replay for cold/small callees from sample profile loader
Summary: Sample profile loader of AutoFDO tries to replay previous inlining using context-sensitive profile. The replay only repeats inlining if the call site block is hot. As a result it punts inlining of small functions, some of which can be beneficial for size, and will still be inlined by the CGSCC inliner later. The oscillation between sample profile loader's inlining and regular CGSCC inlining causes unnecessary loss of context-sensitive profile. It doesn't have much impact on the inline decision itself, but it negatively affects post-inline profile quality, as the CGSCC inliner has to scale counts, which is not as accurate as the original context-sensitive profile, and a bad post-inline profile can misguide code layout. This change adds regular Inline Cost calculation to the sample profile loader, so we can inline small functions upfront under the switch -sample-profile-inline-size. In addition, -sample-profile-cold-inline-threshold is added so we can tune the separate size threshold - currently the default is chosen to be the same as the regular inliner's cold call-site threshold. Reviewers: wmi, davidxl Subscribers: hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D70750
This commit is contained in:
parent
dbd1129724
commit
7b61ae68ec
|
@ -150,6 +150,15 @@ static cl::opt<bool> ProfileTopDownLoad(
|
|||
cl::desc("Do profile annotation and inlining for functions in top-down "
|
||||
"order of call graph during sample profile loading."));
|
||||
|
||||
// Hidden command-line switch (-sample-profile-inline-size), off by default.
// shouldInlineColdCallee() returns false immediately unless this is set, so
// it gates all size-based inlining of cold call sites in the profile loader.
static cl::opt<bool> ProfileSizeInline(
|
||||
"sample-profile-inline-size", cl::Hidden, cl::init(false),
|
||||
cl::desc("Inline cold call sites in profile loader if it's beneficial "
|
||||
"for code size."));
|
||||
|
||||
// Hidden command-line option (-sample-profile-cold-inline-threshold),
// default 45. shouldInlineColdCallee() accepts a cold call site only when its
// computed inline cost is less than or equal to this value.
static cl::opt<int> SampleColdCallSiteThreshold(
|
||||
"sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
|
||||
cl::desc("Threshold for inlining cold callsites"));
|
||||
|
||||
namespace {
|
||||
|
||||
using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
|
||||
|
@ -319,6 +328,8 @@ protected:
|
|||
bool inlineCallInstruction(Instruction *I);
|
||||
bool inlineHotFunctions(Function &F,
|
||||
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
|
||||
// Inline cold/small functions in addition to hot ones
|
||||
bool shouldInlineColdCallee(Instruction &CallInst);
|
||||
void printEdgeWeight(raw_ostream &OS, Edge E);
|
||||
void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
|
||||
void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
|
||||
|
@ -899,6 +910,21 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
|
|||
return false;
|
||||
}
|
||||
|
||||
/// Decide whether a call site that is not hot should still be inlined by the
/// sample profile loader because the inline cost analysis considers it cheap.
///
/// \param CallInst the call site instruction; it is cast to CallBase, so it
/// must be a call-like instruction.
/// \returns false when -sample-profile-inline-size is off, when
/// getCalledFunction() yields no callee, or when the cost analysis says the
/// call must never be inlined; true when the analysis says it must always be
/// inlined; otherwise whether the computed cost is within
/// -sample-profile-cold-inline-threshold.
bool SampleProfileLoader::shouldInlineColdCallee(Instruction &CallInst) {
  if (!ProfileSizeInline)
    return false;

  Function *Callee = CallSite(&CallInst).getCalledFunction();
  if (Callee == nullptr)
    return false;

  InlineCost Cost =
      getInlineCost(cast<CallBase>(CallInst), getInlineParams(),
                    GetTTI(*Callee), GetAC, None, nullptr, nullptr);

  // InlineCost::getCost() is only meaningful for a variable cost; querying it
  // on an "always" or "never" result is an invalid access. Resolve those two
  // fixed verdicts before comparing against the threshold.
  if (Cost.isNever())
    return false;
  if (Cost.isAlways())
    return true;

  return Cost.getCost() <= SampleColdCallSiteThreshold;
}
|
||||
|
||||
/// Iteratively inline hot callsites of a function.
|
||||
///
|
||||
/// Iteratively traverse all callsites of the function \p F, and find if
|
||||
|
@ -931,20 +957,26 @@ bool SampleProfileLoader::inlineHotFunctions(
|
|||
SmallVector<Instruction *, 10> CIS;
|
||||
for (auto &BB : F) {
|
||||
bool Hot = false;
|
||||
SmallVector<Instruction *, 10> Candidates;
|
||||
SmallVector<Instruction *, 10> AllCandidates;
|
||||
SmallVector<Instruction *, 10> ColdCandidates;
|
||||
for (auto &I : BB.getInstList()) {
|
||||
const FunctionSamples *FS = nullptr;
|
||||
if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
|
||||
!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
|
||||
Candidates.push_back(&I);
|
||||
AllCandidates.push_back(&I);
|
||||
if (FS->getEntrySamples() > 0)
|
||||
localNotInlinedCallSites.try_emplace(&I, FS);
|
||||
if (callsiteIsHot(FS, PSI))
|
||||
Hot = true;
|
||||
else if (shouldInlineColdCallee(I))
|
||||
ColdCandidates.push_back(&I);
|
||||
}
|
||||
}
|
||||
if (Hot) {
|
||||
CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
|
||||
CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
|
||||
}
|
||||
else {
|
||||
CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
|
||||
}
|
||||
}
|
||||
for (auto I : CIS) {
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
main:225715:0
|
||||
2.1: 5553
|
||||
3: 5391
|
||||
3.1: _Z3sumii:0
|
||||
0: 0
|
||||
1: 0
|
||||
2: 0
|
|
@ -0,0 +1,102 @@
|
|||
; Let sample profile loader replay inlining of small/cold functions
|
||||
|
||||
; Make sure we don't inline the cold call sites by default
|
||||
; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -S | FileCheck -check-prefix=NOTINLINE %s
|
||||
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -S | FileCheck -check-prefix=NOTINLINE %s
|
||||
|
||||
; Make sure we inline cold call sites for size if requested
|
||||
; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -sample-profile-inline-size -S | FileCheck -check-prefix=INLINE %s
|
||||
|
||||
; Make sure we re-inline everything if requested
|
||||
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=9999999 -S | FileCheck -check-prefix=INLINE %s
|
||||
|
||||
; Make sure the separate size threshold for sample profile loader inlining works
|
||||
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=-500 -S | FileCheck -check-prefix=NOTINLINE %s
|
||||
|
||||
@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
|
||||
|
||||
; Callee used by this test: stores both i32 arguments to stack slots, reloads
; them, and returns their sum. @main calls it from the if.then block, and that
; call is what the FileCheck directives in @main look for.
define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !6 {
|
||||
entry:
|
||||
%x.addr = alloca i32, align 4
|
||||
%y.addr = alloca i32, align 4
|
||||
store i32 %x, i32* %x.addr, align 4
|
||||
store i32 %y, i32* %y.addr, align 4
|
||||
%tmp = load i32, i32* %x.addr, align 4, !dbg !8
|
||||
%tmp1 = load i32, i32* %y.addr, align 4, !dbg !8
|
||||
%add = add nsw i32 %tmp, %tmp1, !dbg !8
|
||||
ret i32 %add, !dbg !8
|
||||
}
|
||||
|
||||
; Test driver. Loops while the counter %i (incremented on every pass through
; while.cond) was below 400000000. In each iteration, if %i != 100 it calls
; @_Z3sumii(%i, %s) and stores the result to %s; otherwise it stores 30 to %s.
; After the loop it prints %s with printf and returns 0.
; NOTE(review): the sample profile input shown with this test appears to list
; _Z3sumii as an inlined callee with zero samples, which is presumably what
; makes the call site below cold - confirm against Inputs/inline-cold.prof.
define i32 @main() !dbg !9 {
|
||||
entry:
|
||||
%retval = alloca i32, align 4
|
||||
%s = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
store i32 0, i32* %retval
|
||||
store i32 0, i32* %i, align 4, !dbg !10
|
||||
br label %while.cond, !dbg !11
|
||||
|
||||
while.cond: ; preds = %if.end, %entry
|
||||
%tmp = load i32, i32* %i, align 4, !dbg !12
|
||||
%inc = add nsw i32 %tmp, 1, !dbg !12
|
||||
store i32 %inc, i32* %i, align 4, !dbg !12
|
||||
%cmp = icmp slt i32 %tmp, 400000000, !dbg !12
|
||||
br i1 %cmp, label %while.body, label %while.end, !dbg !12
|
||||
|
||||
while.body: ; preds = %while.cond
|
||||
%tmp1 = load i32, i32* %i, align 4, !dbg !14
|
||||
%cmp1 = icmp ne i32 %tmp1, 100, !dbg !14
|
||||
br i1 %cmp1, label %if.then, label %if.else, !dbg !14
|
||||
|
||||
if.then: ; preds = %while.body
|
||||
%tmp2 = load i32, i32* %i, align 4, !dbg !16
|
||||
%tmp3 = load i32, i32* %s, align 4, !dbg !16
|
||||
; The call site under test; the two directives right after it assert that it
; is either gone (inlined) or still present, depending on the RUN line.
%call = call i32 @_Z3sumii(i32 %tmp2, i32 %tmp3), !dbg !16
|
||||
; INLINE-NOT: call i32 @_Z3sumii
|
||||
; NOTINLINE: call i32 @_Z3sumii
|
||||
store i32 %call, i32* %s, align 4, !dbg !16
|
||||
br label %if.end, !dbg !16
|
||||
|
||||
if.else: ; preds = %while.body
|
||||
store i32 30, i32* %s, align 4, !dbg !18
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.else, %if.then
|
||||
br label %while.cond, !dbg !20
|
||||
|
||||
while.end: ; preds = %while.cond
|
||||
%tmp4 = load i32, i32* %s, align 4, !dbg !22
|
||||
%call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %tmp4), !dbg !22
|
||||
ret i32 0, !dbg !23
|
||||
}
|
||||
|
||||
declare i32 @printf(i8*, ...)
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!3, !4}
|
||||
!llvm.ident = !{!5}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
|
||||
!1 = !DIFile(filename: "calls.cc", directory: ".")
|
||||
!2 = !{}
|
||||
!3 = !{i32 2, !"Dwarf Version", i32 4}
|
||||
!4 = !{i32 1, !"Debug Info Version", i32 3}
|
||||
!5 = !{!"clang version 3.5 "}
|
||||
!6 = distinct !DISubprogram(name: "sum", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
|
||||
!7 = !DISubroutineType(types: !2)
|
||||
!8 = !DILocation(line: 4, scope: !6)
|
||||
!9 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !7, scopeLine: 7, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
|
||||
!10 = !DILocation(line: 8, scope: !9)
|
||||
!11 = !DILocation(line: 9, scope: !9)
|
||||
!12 = !DILocation(line: 9, scope: !13)
|
||||
!13 = !DILexicalBlockFile(scope: !9, file: !1, discriminator: 2)
|
||||
!14 = !DILocation(line: 10, scope: !15)
|
||||
!15 = distinct !DILexicalBlock(scope: !9, file: !1, line: 10)
|
||||
!16 = !DILocation(line: 10, scope: !17)
|
||||
!17 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
|
||||
!18 = !DILocation(line: 10, scope: !19)
|
||||
!19 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 4)
|
||||
!20 = !DILocation(line: 10, scope: !21)
|
||||
!21 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 6)
|
||||
!22 = !DILocation(line: 11, scope: !9)
|
||||
!23 = !DILocation(line: 12, scope: !9)
|
Loading…
Reference in New Issue