[CSSPGO] Update pseudo probe distribution factor based on inline context.

With prelink inlining, pseudo probes with the same ID can come from different inline contexts. Such probes should not share samples, and their factors should be fixed up separately.

I'm seeing 0.3% speedup for SPEC2017 overall. Benchmark 631.deepsjeng_s benefits the most, about 4%.

Reviewed By: wenlei, wmi

Differential Revision: https://reviews.llvm.org/D102429
This commit is contained in:
Hongtao Yu 2021-05-13 11:06:44 -07:00
parent 341902672c
commit f28ee1a2b3
4 changed files with 131 additions and 7 deletions

View File

@ -80,6 +80,9 @@ struct PseudoProbe {
uint32_t Id;
uint32_t Type;
uint32_t Attr;
// Distribution factor that estimates the portion of the real execution count.
// A saturated distribution factor stands for 1.0 or 100%. A pseudo probe has
// a factor with a value ranging from 0.0 to 1.0.
float Factor;
bool isDangling() const {

View File

@ -33,7 +33,10 @@ class Module;
using namespace sampleprof;
using BlockIdMap = std::unordered_map<BasicBlock *, uint32_t>;
using InstructionIdMap = std::unordered_map<Instruction *, uint32_t>;
using ProbeFactorMap = std::unordered_map<uint64_t, float>;
// Map from tuples of Probe id and inline stack hash code to distribution
// factors.
using ProbeFactorMap = std::unordered_map<std::pair<uint64_t, uint64_t>, float,
pair_hash<uint64_t, uint64_t>>;
using FuncProbeFactorMap = StringMap<ProbeFactorMap>;
enum class PseudoProbeReservedId { Invalid = 0, Last = Invalid };
@ -135,6 +138,18 @@ public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
// Pseudo probe distribution factor updater.
// Sample profile annotation can happen in both LTO prelink and postlink. The
// postlink-time re-annotation can degrade profile quality because of prelink
// code duplication transformation, such as loop unrolling, jump threading,
// indirect call promotion etc. As such, samples corresponding to a source
// location may be aggregated multiple times in postlink. With a concept of
// distribution factor for pseudo probes, samples can be distributed among
// duplicated probes reasonably, based on the assumption that optimizations
// duplicating code maintain the branch frequency information (BFI) well. This
// pass updates distribution factors for each pseudo probe at the end of the
// prelink pipeline, to reflect an estimated portion of the real execution
// count.
class PseudoProbeUpdatePass : public PassInfoMixin<PseudoProbeUpdatePass> {
void runOnFunction(Function &F, FunctionAnalysisManager &FAM);

View File

@ -50,6 +50,27 @@ static cl::opt<bool>
UpdatePseudoProbe("update-pseudo-probe", cl::init(true), cl::Hidden,
cl::desc("Update pseudo probe distribution factor"));
static uint64_t getCallStackHash(const DILocation *DIL) {
uint64_t Hash = 0;
const DILocation *InlinedAt = DIL ? DIL->getInlinedAt() : nullptr;
while (InlinedAt) {
Hash ^= MD5Hash(std::to_string(InlinedAt->getLine()));
Hash ^= MD5Hash(std::to_string(InlinedAt->getColumn()));
const DISubprogram *SP = InlinedAt->getScope()->getSubprogram();
// Use linkage name for C++ if possible.
auto Name = SP->getLinkageName();
if (Name.empty())
Name = SP->getName();
Hash ^= MD5Hash(Name);
InlinedAt = InlinedAt->getInlinedAt();
}
return Hash;
}
static uint64_t computeCallStackHash(const Instruction &Inst) {
return getCallStackHash(Inst.getDebugLoc());
}
bool PseudoProbeVerifier::shouldVerifyFunction(const Function *F) {
// Skip function declaration.
if (F->isDeclaration())
@ -117,8 +138,10 @@ void PseudoProbeVerifier::runAfterPass(const Loop *L) {
void PseudoProbeVerifier::collectProbeFactors(const BasicBlock *Block,
ProbeFactorMap &ProbeFactors) {
for (const auto &I : *Block) {
if (Optional<PseudoProbe> Probe = extractProbe(I))
ProbeFactors[Probe->Id] += Probe->Factor;
if (Optional<PseudoProbe> Probe = extractProbe(I)) {
uint64_t Hash = computeCallStackHash(I);
ProbeFactors[{Probe->Id, Hash}] += Probe->Factor;
}
}
}
@ -136,7 +159,7 @@ void PseudoProbeVerifier::verifyProbeFactors(
dbgs() << "Function " << F->getName() << ":\n";
BannerPrinted = true;
}
dbgs() << "Probe " << I.first << "\tprevious factor "
dbgs() << "Probe " << I.first.first << "\tprevious factor "
<< format("%0.2f", PrevProbeFactor) << "\tcurrent factor "
<< format("%0.2f", CurProbeFactor) << "\n";
}
@ -407,8 +430,10 @@ void PseudoProbeUpdatePass::runOnFunction(Function &F,
// execution count of the probe. The original samples of the probe will
// be distributed among the remaining probes if there are any. This is
// less than ideal, but at least we don't lose any samples.
if (!Probe->isDangling())
ProbeFactors[Probe->Id] += BBProfileCount(&Block);
if (!Probe->isDangling()) {
uint64_t Hash = computeCallStackHash(I);
ProbeFactors[{Probe->Id, Hash}] += BBProfileCount(&Block);
}
}
}
}
@ -420,7 +445,8 @@ void PseudoProbeUpdatePass::runOnFunction(Function &F,
// Ignore dangling probes since they are logically deleted and should
// not consume any profile samples in the subsequent profile annotation.
if (!Probe->isDangling()) {
float Sum = ProbeFactors[Probe->Id];
uint64_t Hash = computeCallStackHash(I);
float Sum = ProbeFactors[{Probe->Id, Hash}];
if (Sum != 0)
setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum);
}

View File

@ -0,0 +1,80 @@
; RUN: opt < %s -passes='pseudo-probe-update' -S | FileCheck %s
declare i32 @f1()
declare i32 @f2()
declare void @f3()
;; Probes 4, 7 and 8 each appear twice, once in block T1 and once in block
;; Merge. The CHECK lines below expect only copies that share both the probe
;; id and the inline context (the !dbg location) to have their distribution
;; factors split between them.
define i32 @foo(i1 %cond, i1 %cond2) !dbg !4 !prof !10 {
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1)
br i1 %cond, label %T1, label %Merge, !prof !11
T1: ; preds = %0
%v1 = call i32 @f1(), !prof !12
%cond3 = icmp eq i32 %v1, 412
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1)
;; The distribution factor -8513881372706734080 stands for 53.85%, which is from 7/(6+7).
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !13
;; Probe 7 has two copies. Since they don't share the same inline context, they are not
;; considered to be sharing samples, thus their distribution factors are not fixed up.
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 7, i32 0, i64 -1)
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !13
;; Similar to Probe 7, one copy of Probe 8 doesn't have inline context.
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 8, i32 0, i64 -1)
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 8, i32 0, i64 -1), !dbg !13
br i1 %cond3, label %T2, label %F2, !prof !11
Merge: ; preds = %0
%v2 = call i32 @f2(), !prof !12
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 3, i32 0, i64 -1)
;; The distribution factor 8513881922462547968 stands for 46.15%, which is from 6/(6+7).
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64 8513881922462547968)
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 8513881922462547968), !dbg !13
;; This copy of Probe 7 is attached to !18, a different inline context from the copy in T1 (!13).
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 7, i32 0, i64 -1)
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !18
;; This copy of Probe 8 is attached to !19, which has no inlinedAt.
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 8, i32 0, i64 -1)
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 8, i32 0, i64 -1), !dbg !19
br i1 %cond2, label %T2, label %F2, !prof !11
T2: ; preds = %Merge, %T1
%B1 = phi i32 [ %v1, %T1 ], [ %v2, %Merge ]
call void @f3(), !prof !12
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 5, i32 0, i64 -1)
ret i32 %B1
F2: ; preds = %Merge, %T1
%B2 = phi i32 [ %v1, %T1 ], [ %v2, %Merge ]
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1)
ret i32 %B2
}
; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
attributes #0 = { inaccessiblememonly nounwind willreturn }
!llvm.module.flags = !{!0, !1}
!llvm.pseudo_probe_desc = !{!2, !3}
!0 = !{i32 7, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i64 6699318081062747564, i64 281479271677951, !"foo", null}
!3 = !{i64 6468398850841090686, i64 138828622701, !"zen", null}
!4 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 9, type: !6, scopeLine: 9, spFlags: DISPFlagDefinition, unit: !9)
!5 = !DIFile(filename: "test.cpp", directory: "test")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !5, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
!10 = !{!"function_entry_count", i64 14}
!11 = !{!"branch_weights", i32 8, i32 7}
!12 = !{!"branch_weights", i32 7}
!13 = !DILocation(line: 39, column: 9, scope: !14, inlinedAt: !16)
!14 = distinct !DILexicalBlock(scope: !15, file: !5, line: 39, column: 7)
!15 = distinct !DISubprogram(name: "zen", scope: !5, file: !5, line: 37, type: !6, scopeLine: 38, spFlags: DISPFlagDefinition, unit: !9)
!16 = distinct !DILocation(line: 10, column: 11, scope: !17)
!17 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646551)
!18 = !DILocation(line: 53, column: 3, scope: !15, inlinedAt: !19)
!19 = !DILocation(line: 12, column: 3, scope: !4)