[CSSPGO][llvm-profgen] Fix external address issues of perf reader (leading external LBR part)

A sample can land directly on an external address; in that case, both the top stack frame and the latest LBR target are external addresses. For example:
```
	        ffffffff
 0x4006c8/0xffffffff/P/-/-/0  0x40069b/0x400670/M/-/-/0

 	          ffffffff
	          40067e
0xffffffff/0xffffffff/P/-/-/0  0x4006c8/0xffffffff/P/-/-/0  0x40069b/0x400670/M/-/-/0
```
Previously we ignored the entire sample. However, the remaining part of such a sample can still contain internal LBRs, and the ranges between them are still valid, so dropping the whole sample loses valid LBRs. Those LBRs will now be unwound based on an empty (context-less) call stack.

This change tries to fix it: instead of ignoring the entire sample, we only ignore the leading external addresses.

Note that the first outgoing LBR is useful since there is a valid range between its source and the next LBR's target.

Reviewed By: hoy, wenlei

Differential Revision: https://reviews.llvm.org/D115538
This commit is contained in:
wlei 2021-12-14 16:28:36 -08:00
parent 3220571793
commit 3dcb60db9a
7 changed files with 165 additions and 44 deletions

View File

@ -0,0 +1,39 @@
PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin
; Test for an external top address, should only ignore the call stack and keep unwinding the LBR
; Valid LBR. The first 4006d7 will be kept for unwinding, the second will be truncated.
ffffffff
ffffffff
4006d7
ffffffff
4006d7
ffffffff
0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0
; Valid LBR
ffffffff
0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0
; Valid LBR
ffffffff
0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0
; Valid LBR
40067e
0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0
; Valid LBR
ffffffff
5541f689495641d7
0xffffffff/0xffffffff/P/-/-/0 0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0
; Empty sample
ffffffff
5541f689495641d7
0xffffffff/0xffffffff/P/-/-/0 0xffffffff/0xffffffff/P/-/-/0
; Invalid LBR
ffffffff
0xffffffff/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0

View File

@ -1,12 +1,5 @@
PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin
; test for an external or invalid top address, should skip the whole sample
ffffffff
40067e
5541f689495641d7
0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0
40067e
5541f689495641d7
0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0

View File

@ -0,0 +1,28 @@
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/external-address.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --skip-symbolization --profile-summary-hot-count=0 --compress-recursion=0
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER
; CHECK-UNWINDER: [main:1 @ foo]
; CHECK-UNWINDER: 2
; CHECK-UNWINDER: 670-6ad:4
; CHECK-UNWINDER: 6bd-6c8:4
; CHECK-UNWINDER: 2
; CHECK-UNWINDER: 69b->670:5
; CHECK-UNWINDER: 6c8->67e:1
; CHECK-UNWINDER: [main:1 @ foo:3.1 @ bar]
; CHECK-UNWINDER: 1
; CHECK-UNWINDER: 6af-6bb:4
; CHECK-UNWINDER: 0
; Manually created to test if remaining call stack can be correctly unwinded.
; CHECK-UNWINDER: [main:1 @ foo:4 @ main:1 @ foo]
; CHECK-UNWINDER: 2
; CHECK-UNWINDER: 670-6ad:1
; CHECK-UNWINDER: 6bd-6c8:1
; CHECK-UNWINDER: 2
; CHECK-UNWINDER: 69b->670:1
; CHECK-UNWINDER: 6c8->67e:1
; CHECK-UNWINDER: [main:1 @ foo:4 @ main:1 @ foo:3.1 @ bar]
; CHECK-UNWINDER: 1
; CHECK-UNWINDER: 6af-6bb:1
; CHECK-UNWINDER: 0

View File

@ -3,7 +3,6 @@
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-interrupt.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --skip-symbolization --profile-summary-cold-count=0
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-interrupt.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --profile-summary-cold-count=0
>>>>>>> 02ea7084c370 ([llvm-profgen] Support LBR only perf script)
; RUN: FileCheck %s --input-file %t
; CHECK:[main:1 @ foo]:88:0

View File

@ -73,22 +73,30 @@
;CHECK: 1: 5
;CHECK: 2: 5
;CHECK: 3: 5
;CHECK: main:486:0
;CHECK: main:906:0
;CHECK: 0: 0
;CHECK: 3: 0
;CHECK: 4.1: 0
;CHECK: 4.3: 0
;CHECK: 5.1: 6
;CHECK: 5.3: 6
;CHECK: 6: 6
;CHECK: 6.1: 6
;CHECK: 6.3: 6
;CHECK: 5.1: 11
;CHECK: 5.3: 11
;CHECK: 6: 11
;CHECK: 6.1: 14
;CHECK: 6.3: 11
;CHECK: 7: 0
;CHECK: 8: 0 quick_sort:1
;CHECK: 9: 0
;CHECK: 11: 0
;CHECK: 14: 0
;CHECK: 65499: 0
;CHECK: quick_sort:903:25
;CHECK: 1: 24
;CHECK: 2: 12 partition_pivot_last:7 partition_pivot_first:5
;CHECK: 3: 11 quick_sort:12
;CHECK: 4: 12 quick_sort:12
;CHECK: 6: 24
;CHECK: 65507: 12
; original code:

View File

@ -59,6 +59,7 @@ void VirtualUnwinder::unwindCall(UnwindState &State) {
// pro/epi tracker(Dwarf CFI) for the precise check.
uint64_t Source = State.getCurrentLBRSource();
auto *ParentFrame = State.getParentFrame();
if (ParentFrame == State.getDummyRootPtr() ||
ParentFrame->Address != Source) {
State.switchToFrame(Source);
@ -121,7 +122,7 @@ void VirtualUnwinder::unwindReturn(UnwindState &State) {
State.InstPtr.update(LBR.Source);
}
void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
void VirtualUnwinder::unwindBranch(UnwindState &State) {
// TODO: Tolerate tail call for now, as we may see tail call from libraries.
// This is only for intra function branches, excluding tail calls.
uint64_t Source = State.getCurrentLBRSource();
@ -219,7 +220,7 @@ void VirtualUnwinder::collectSamplesFromFrameTrie(
void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
UnwindState &State, uint64_t Repeat) {
if (Branch.IsArtificial)
if (Branch.IsArtificial || Branch.Target == ExternalAddr)
return;
if (Binary->usePseudoProbes()) {
@ -242,21 +243,18 @@ bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) {
if (!State.validateInitialState())
return false;
// Also do not attempt linear unwind for the leaf range as it's incomplete.
bool IsLeaf = true;
// Now process the LBR samples in parallel with stack sample
// Note that we do not reverse the LBR entry order so we can
// unwind the sample stack as we walk through LBR entries.
while (State.hasNextLBR()) {
State.checkStateConsistency();
// Do not attempt linear unwind for the leaf range as it's incomplete.
if (!State.IsLastLBR()) {
// Unwind implicit calls/returns from inlining, along the linear path,
// break into smaller sub section each with its own calling context.
if (!IsLeaf) {
unwindLinear(State, Repeat);
}
IsLeaf = false;
// Save the LBR branch before it gets unwound.
const LBREntry &Branch = State.getCurrentLBR();
@ -271,9 +269,15 @@ bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) {
// Unwind returns - check whether the IP is indeed at a return instruction
unwindReturn(State);
} else {
// Unwind branches - for regular intra function branches, we only
// need to record branch with context.
unwindBranchWithinFrame(State);
// Unwind branches
// For regular intra function branches, we only need to record branch with
// context. For an artificial branch cross function boundaries, we got an
// issue with returning to external code. Take the two LBR entries for
// example: [foo:8(RETURN), ext:1] [ext:3(CALL), bar:1] After perf reader,
// we only get[foo:8(RETURN), bar:1], unwinder will be confused like foo
// return to bar. Here we detect and treat this case as BRANCH instead of
// RETURN which only update the source address.
unwindBranch(State);
}
State.advanceLBR();
// Record `branch` with calling context after unwinding.
@ -432,9 +436,9 @@ void HybridPerfReader::unwindSamples() {
if (Binary->useFSDiscriminator())
exitWithError("FS discriminator is not supported in CS profile.");
std::set<uint64_t> AllUntrackedCallsites;
VirtualUnwinder Unwinder(&SampleCounters, Binary);
for (const auto &Item : AggregatedSamples) {
const PerfSample *Sample = Item.first.getPtr();
VirtualUnwinder Unwinder(&SampleCounters, Binary);
Unwinder.unwind(Sample, Item.second);
auto &CurrUntrackedCallsites = Unwinder.getUntrackedCallsites();
AllUntrackedCallsites.insert(CurrUntrackedCallsites.begin(),
@ -508,26 +512,32 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt,
bool IsOutgoing = SrcIsInternal && !DstIsInternal;
bool IsArtificial = false;
// Ignore branches outside the current binary. Ignore all remaining branches
// if there's no incoming branch before the external branch in reverse
// order.
// Ignore branches outside the current binary.
if (IsExternal) {
if (PrevTrDst)
continue;
if (!LBRStack.empty()) {
if (!PrevTrDst && !LBRStack.empty()) {
WithColor::warning()
<< "Invalid transfer to external code in LBR record at line "
<< TraceIt.getLineNumber() << ": " << TraceIt.getCurrentLine()
<< "\n";
}
break;
// Do not ignore the entire samples, the remaining LBR can still be
// unwound using a context-less stack.
continue;
}
if (IsOutgoing) {
if (!PrevTrDst) {
// This is unpaired outgoing jump which is likely due to interrupt or
// incomplete LBR trace. Ignore current and subsequent entries since
// they are likely in different contexts.
// This is a leading outgoing LBR, we should keep processing the LBRs.
if (LBRStack.empty()) {
NumLeadingOutgoingLBR++;
// Record this LBR since current source and next LBR's target is still
// a valid range.
LBRStack.emplace_back(LBREntry(Src, ExternalAddr, false));
continue;
}
// This is middle unpaired outgoing jump which is likely due to
// interrupt or incomplete LBR trace. Ignore current and subsequent
// entries since they are likely in different contexts.
break;
}
@ -593,9 +603,17 @@ bool PerfScriptReader::extractCallstack(TraceStream &TraceIt,
}
TraceIt.advance();
// Currently intermixed frame from different binaries is not supported.
// Ignore caller frames not from binary of interest.
if (!Binary->addressIsCode(FrameAddr))
break;
if (!Binary->addressIsCode(FrameAddr)) {
if (CallStack.empty())
NumLeafExternalFrame++;
// Push a special value(ExternalAddr) for the external frames so that
// unwinder can still work on this with artificial Call/Return branch.
// After unwinding, the context will be truncated for external frame.
// Also deduplicate the consecutive external addresses.
if (CallStack.empty() || CallStack.back() != ExternalAddr)
CallStack.emplace_back(ExternalAddr);
continue;
}
// We need to translate return address to call address for non-leaf frames.
if (!CallStack.empty()) {
@ -613,6 +631,10 @@ bool PerfScriptReader::extractCallstack(TraceStream &TraceIt,
CallStack.emplace_back(FrameAddr);
}
// Strip out the bottom external addr.
if (CallStack.size() > 1 && CallStack.back() == ExternalAddr)
CallStack.pop_back();
// Skip other unrelated line, find the next valid LBR line
// Note that even for empty call stack, we should skip the address at the
// bottom, otherwise the following pass may generate a truncated callstack
@ -885,6 +907,7 @@ uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) {
}
void PerfScriptReader::parseSample(TraceStream &TraceIt) {
NumTotalSample++;
uint64_t Count = parseAggregatedCount(TraceIt);
assert(Count >= 1 && "Aggregated count should be >= 1!");
parseSample(TraceIt, Count);
@ -1131,6 +1154,11 @@ void PerfScriptReader::parsePerfTraces() {
// Parse perf traces and do aggregation.
parseAndAggregateTrace();
emitWarningSummary(NumLeafExternalFrame, NumTotalSample,
"of samples have leaf external frame in call stack.");
emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample,
"of samples have leading external LBR.");
// Generate unsymbolized profile.
warnTruncatedStack();
warnInvalidRange();

View File

@ -213,6 +213,15 @@ using AggregatedCounter =
Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;
using SampleVector = SmallVector<std::tuple<uint64_t, uint64_t, uint64_t>, 16>;
/// Sentinel values stored in place of an ordinary frame address.
enum SpecialFrameAddr {
  /// Address carried by the dummy root of the frame trie.
  DummyRoot = 0,
  /// Single stand-in for all addresses outside of the current binary.
  ExternalAddr = 1,
};
// The state for the unwinder, it doesn't hold the data but only keep the
// pointer/index of the data, While unwinding, the CallStack is changed
// dynamically and will be recorded as the context of the sample
@ -221,7 +230,7 @@ struct UnwindState {
const ProfiledBinary *Binary;
// Call stack trie node
struct ProfiledFrame {
const uint64_t Address = 0;
const uint64_t Address = DummyRoot;
ProfiledFrame *Parent;
SampleVector RangeSamples;
SampleVector BranchSamples;
@ -241,7 +250,8 @@ struct UnwindState {
// Append one (Source, Target, Count) entry to this frame's BranchSamples.
void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) {
  // Hand the three fields straight to emplace_back so the tuple is built from
  // them directly rather than from a std::make_tuple temporary.
  BranchSamples.emplace_back(Source, Target, Count);
}
bool isDummyRoot() { return Address == 0; }
bool isDummyRoot() { return Address == DummyRoot; }
bool isExternalFrame() { return Address == ExternalAddr; }
bool isLeafFrame() { return Children.empty(); }
};
@ -262,6 +272,9 @@ struct UnwindState {
bool validateInitialState() {
uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
uint64_t LeafAddr = CurrentLeafFrame->Address;
assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) &&
"External leading LBR should match the leaf frame.");
// When we take a stack sample, ideally the sampling distance between the
// leaf IP of stack and the last LBR target shouldn't be very large.
// Use a heuristic size (0x100) to filter out broken records.
@ -283,8 +296,9 @@ struct UnwindState {
uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; }
uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; }
const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; }
bool IsLastLBR() const { return LBRIndex == 0; }
bool getLBRStackSize() const { return LBRStack.size(); }
void advanceLBR() { LBRIndex++; }
ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; }
void pushFrame(uint64_t Address) {
@ -412,6 +426,10 @@ struct FrameStack {
ProfiledBinary *Binary;
FrameStack(ProfiledBinary *B) : Binary(B) {}
// Append Cur's address to Stack unless Cur is an external frame. Returns
// false for an external frame (nothing is pushed) and true otherwise.
bool pushFrame(UnwindState::ProfiledFrame *Cur) {
  // An external frame isn't a real call context the compiler will see, so the
  // context is truncated here instead of recording the frame.
  const bool IsInternal = !Cur->isExternalFrame();
  if (IsInternal)
    Stack.push_back(Cur->Address);
  return IsInternal;
}
@ -428,6 +446,10 @@ struct ProbeStack {
ProfiledBinary *Binary;
ProbeStack(ProfiledBinary *B) : Binary(B) {}
bool pushFrame(UnwindState::ProfiledFrame *Cur) {
// Truncate the context for external frame since this isn't a real call
// context the compiler will see
if (Cur->isExternalFrame())
return false;
const MCDecodedPseudoProbe *CallProbe =
Binary->getCallProbeForAddr(Cur->Address);
// We may not find a probe for a merged or external callsite.
@ -500,7 +522,7 @@ private:
void unwindCall(UnwindState &State);
void unwindLinear(UnwindState &State, uint64_t Repeat);
void unwindReturn(UnwindState &State);
void unwindBranchWithinFrame(UnwindState &State);
void unwindBranch(UnwindState &State);
template <typename T>
void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack);
@ -546,6 +568,10 @@ protected:
ContextSampleCounterMap SampleCounters;
bool ProfileIsCSFlat = false;
uint64_t NumTotalSample = 0;
uint64_t NumLeafExternalFrame = 0;
uint64_t NumLeadingOutgoingLBR = 0;
};
// Read perf script to parse the events and samples.