//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H
#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H
#include "ErrorHandling.h"
#include "ProfiledBinary.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Regex.h"
#include <cstdint>
#include <fstream>
#include <list>
#include <map>
#include <vector>

using namespace llvm;
using namespace sampleprof;

namespace llvm {
namespace sampleprof {

// Stream based trace line iterator
class TraceStream {
  std::string CurrentLine;
  std::ifstream Fin;
  bool IsAtEoF = false;
  uint64_t LineNumber = 0;

public:
  TraceStream(StringRef Filename) : Fin(Filename.str()) {
    if (!Fin.good())
      exitWithError("Error reading input perf script file", Filename);
    advance();
  }

  StringRef getCurrentLine() {
    assert(!IsAtEoF && "Line iterator reaches the End-of-File!");
    return CurrentLine;
  }

  uint64_t getLineNumber() { return LineNumber; }

  bool isAtEoF() { return IsAtEoF; }

  // Read the next line
  void advance() {
    if (!std::getline(Fin, CurrentLine)) {
      IsAtEoF = true;
      return;
    }
    LineNumber++;
  }
};

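// Illustrative use of TraceStream (a minimal sketch; the driver loop below is
// hypothetical and not part of this header):
//
//   TraceStream TraceIt("perf.script");
//   while (!TraceIt.isAtEoF()) {
//     StringRef Line = TraceIt.getCurrentLine();
//     // ... parse one event or sample from Line ...
//     TraceIt.advance();
//   }
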
// The type of input format.
enum PerfFormat {
  UnknownFormat = 0,
  PerfData = 1,            // Raw Linux perf.data.
  PerfScript = 2,          // Perf script created by the `perf script` command.
  UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen.
};

// The type of perfscript content.
enum PerfContent {
  UnknownContent = 0,
  LBR = 1,      // Only LBR sample.
  LBRStack = 2, // Hybrid sample including call stack and LBR stack.
};

struct PerfInputFile {
  std::string InputFile;
  PerfFormat Format = PerfFormat::UnknownFormat;
  PerfContent Content = PerfContent::UnknownContent;
};

// The parsed LBR sample entry.
struct LBREntry {
  uint64_t Source = 0;
  uint64_t Target = 0;
  // An artificial branch stands for a series of consecutive branches starting
  // from the current binary with a transition through external code and
  // eventually landing back in the current binary.
  bool IsArtificial = false;
  LBREntry(uint64_t S, uint64_t T, bool I)
      : Source(S), Target(T), IsArtificial(I) {}

#ifndef NDEBUG
  void print() const {
    dbgs() << "from " << format("%#010x", Source) << " to "
           << format("%#010x", Target);
    if (IsArtificial)
      dbgs() << " Artificial";
  }
#endif
};

#ifndef NDEBUG
static inline void printLBRStack(const SmallVectorImpl<LBREntry> &LBRStack) {
  for (size_t I = 0; I < LBRStack.size(); I++) {
    dbgs() << "[" << I << "] ";
    LBRStack[I].print();
    dbgs() << "\n";
  }
}

static inline void printCallStack(const SmallVectorImpl<uint64_t> &CallStack) {
  for (size_t I = 0; I < CallStack.size(); I++) {
    dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n";
  }
}
#endif

// Hash interface for generic data of type T
// Data should implement a \fn getHashCode and a \fn isEqual
// Currently getHashCode is non-virtual to avoid the overhead of calling the
// vtable, i.e. we explicitly calculate the hash of the derived class and
// assign it to the base class's HashCode. This also provides the flexibility
// for calculating the hash code incrementally (like a rolling hash) during
// frame stack unwinding, since unwinding only changes the leaf of the frame
// stack. \fn isEqual is a virtual function, which will have perf overhead. In
// the future, if we redesign a better hash function, then we can just skip
// this or switch to a non-virtual function (like just skipping the comparison
// if the hash collision probability is low).
template <class T> class Hashable {
public:
  std::shared_ptr<T> Data;
  Hashable(const std::shared_ptr<T> &D) : Data(D) {}

  // Hash code generation
  struct Hash {
    uint64_t operator()(const Hashable<T> &Key) const {
      // Don't make it virtual for getHashCode
      uint64_t Hash = Key.Data->getHashCode();
      assert(Hash && "Should generate HashCode for it!");
      return Hash;
    }
  };

  // Hash equal
  struct Equal {
    bool operator()(const Hashable<T> &LHS, const Hashable<T> &RHS) const {
      // Precisely compare the data, vtable will have overhead.
      return LHS.Data->isEqual(RHS.Data.get());
    }
  };

  T *getPtr() const { return Data.get(); }
};

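// Illustrative sketch of the contract Hashable expects from T (the Foo type
// below is hypothetical and only spells out the required members):
//
//   struct Foo {
//     uint64_t Id = 0;
//     uint64_t getHashCode() const { return Id ? Id : 1; } // non-virtual
//     bool isEqual(const Foo *Other) const { return Id == Other->Id; }
//   };
//   using FooCounter =
//       std::unordered_map<Hashable<Foo>, uint64_t, Hashable<Foo>::Hash,
//                          Hashable<Foo>::Equal>;
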
struct PerfSample {
  // LBR stack recorded in FIFO order.
  SmallVector<LBREntry, 16> LBRStack;
  // Call stack recorded in FILO (leaf to root) order; it's used for CS-profile
  // generation.
  SmallVector<uint64_t, 16> CallStack;

  virtual ~PerfSample() = default;
  uint64_t getHashCode() const {
    // Use simple DJB2 hash
    auto HashCombine = [](uint64_t H, uint64_t V) {
      return ((H << 5) + H) + V;
    };
    uint64_t Hash = 5381;
    for (const auto &Value : CallStack) {
      Hash = HashCombine(Hash, Value);
    }
    for (const auto &Entry : LBRStack) {
      Hash = HashCombine(Hash, Entry.Source);
      Hash = HashCombine(Hash, Entry.Target);
    }
    return Hash;
  }

  bool isEqual(const PerfSample *Other) const {
    const SmallVector<uint64_t, 16> &OtherCallStack = Other->CallStack;
    const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;

    if (CallStack.size() != OtherCallStack.size() ||
        LBRStack.size() != OtherLBRStack.size())
      return false;

    if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin()))
      return false;

    for (size_t I = 0; I < OtherLBRStack.size(); I++) {
      if (LBRStack[I].Source != OtherLBRStack[I].Source ||
          LBRStack[I].Target != OtherLBRStack[I].Target)
        return false;
    }
    return true;
  }

#ifndef NDEBUG
  void print() const {
    dbgs() << "LBR stack\n";
    printLBRStack(LBRStack);
    dbgs() << "Call stack\n";
    printCallStack(CallStack);
  }
#endif
};

// After parsing the sample, we record the samples by aggregating them
// into this counter. The key stores the sample data and the value is
// the number of times the sample repeats.
using AggregatedCounter =
    std::unordered_map<Hashable<PerfSample>, uint64_t,
                       Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;

using SampleVector = SmallVector<std::tuple<uint64_t, uint64_t, uint64_t>, 16>;

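// Illustrative aggregation sketch (hypothetical driver code):
//
//   AggregatedCounter Counter;
//   auto Sample = std::make_shared<PerfSample>();
//   // ... fill Sample->CallStack / Sample->LBRStack from one trace record ...
//   Counter[Hashable<PerfSample>(Sample)] += 1;
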
// The state for the unwinder. It doesn't hold the data but only keeps
// pointers/indices into the data. While unwinding, the call stack is changed
// dynamically and will be recorded as the context of the sample.
struct UnwindState {
  // Profiled binary that the current frame address belongs to
  const ProfiledBinary *Binary;
  // Call stack trie node
  struct ProfiledFrame {
    const uint64_t Address = DummyRoot;
    ProfiledFrame *Parent;
    SampleVector RangeSamples;
    SampleVector BranchSamples;
    std::unordered_map<uint64_t, std::unique_ptr<ProfiledFrame>> Children;

    ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr)
        : Address(Addr), Parent(P) {}
    ProfiledFrame *getOrCreateChildFrame(uint64_t Address) {
      assert(Address && "Address can't be zero!");
      auto Ret = Children.emplace(
          Address, std::make_unique<ProfiledFrame>(Address, this));
      return Ret.first->second.get();
    }
    void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) {
      RangeSamples.emplace_back(std::make_tuple(Start, End, Count));
    }
    void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) {
      BranchSamples.emplace_back(std::make_tuple(Source, Target, Count));
    }
    bool isDummyRoot() { return Address == DummyRoot; }
    bool isExternalFrame() { return Address == ExternalAddr; }
    bool isLeafFrame() { return Children.empty(); }
  };

  ProfiledFrame DummyTrieRoot;
  ProfiledFrame *CurrentLeafFrame;
  // Used to fall through the LBR stack
  uint32_t LBRIndex = 0;
  // Reference to PerfSample.LBRStack
  const SmallVector<LBREntry, 16> &LBRStack;
  // Used to iterate the address range
  InstructionPointer InstPtr;
  UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary)
      : Binary(Binary), LBRStack(Sample->LBRStack),
        InstPtr(Binary, Sample->CallStack.front()) {
    initFrameTrie(Sample->CallStack);
  }

  bool validateInitialState() {
    uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
    uint64_t LeafAddr = CurrentLeafFrame->Address;
    assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) &&
           "External leading LBR should match the leaf frame.");

    // When we take a stack sample, ideally the sampling distance between the
    // leaf IP of the stack and the last LBR target shouldn't be very large.
    // Use a heuristic size (0x100) to filter out broken records.
    if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) {
      WithColor::warning() << "Bogus trace: stack tip = "
                           << format("%#010x", LeafAddr)
                           << ", LBR tip = " << format("%#010x\n", LBRLeaf);
      return false;
    }
    return true;
  }

  void checkStateConsistency() {
    assert(InstPtr.Address == CurrentLeafFrame->Address &&
           "IP should align with context leaf");
  }

  bool hasNextLBR() const { return LBRIndex < LBRStack.size(); }
  uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; }
  uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; }
  const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; }
  bool IsLastLBR() const { return LBRIndex == 0; }
  uint64_t getLBRStackSize() const { return LBRStack.size(); }
  void advanceLBR() { LBRIndex++; }
  ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; }

  void pushFrame(uint64_t Address) {
    CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address);
  }

  void switchToFrame(uint64_t Address) {
    if (CurrentLeafFrame->Address == Address)
      return;
    CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address);
  }

  void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; }

  void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; }

  void initFrameTrie(const SmallVectorImpl<uint64_t> &CallStack) {
    ProfiledFrame *Cur = &DummyTrieRoot;
    for (auto Address : reverse(CallStack)) {
      Cur = Cur->getOrCreateChildFrame(Address);
    }
    CurrentLeafFrame = Cur;
  }

  ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; }
};

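// Illustrative sketch of the frame trie above (hypothetical addresses): a
// call stack sampled leaf-to-root as {0x4005dc, 0x400634, 0x400684} is
// installed root-to-leaf under the dummy root by initFrameTrie, and the
// unwinder then moves CurrentLeafFrame around it:
//
//   UnwindState State(Sample, Binary);
//   // DummyTrieRoot -> 0x400684 -> 0x400634 -> 0x4005dc (CurrentLeafFrame)
//   State.pushFrame(0x4005b0); // deepen the context by one callee
//   State.popFrame();          // back to 0x4005dc
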
// Base class for sample counter key with context
struct ContextKey {
  uint64_t HashCode = 0;
  virtual ~ContextKey() = default;
  uint64_t getHashCode() {
    if (HashCode == 0)
      genHashCode();
    return HashCode;
  }
  virtual void genHashCode() = 0;
  virtual bool isEqual(const ContextKey *K) const {
    return HashCode == K->HashCode;
  };

  // Utilities for LLVM-style RTTI
  enum ContextKind { CK_StringBased, CK_ProbeBased };
  const ContextKind Kind;
  ContextKind getKind() const { return Kind; }
  ContextKey(ContextKind K) : Kind(K){};
};

// String based context id
struct StringBasedCtxKey : public ContextKey {
  SampleContextFrameVector Context;

  bool WasLeafInlined;
  StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){};
  static bool classof(const ContextKey *K) {
    return K->getKind() == CK_StringBased;
  }

  bool isEqual(const ContextKey *K) const override {
    const StringBasedCtxKey *Other = dyn_cast<StringBasedCtxKey>(K);
    return Context == Other->Context;
  }

  void genHashCode() override {
    HashCode = hash_value(SampleContextFrames(Context));
  }
};

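// Illustrative use of the LLVM-style RTTI hooks (hypothetical snippet):
//
//   void inspect(const ContextKey *K) {
//     if (const auto *S = dyn_cast<StringBasedCtxKey>(K)) {
//       // S->Context holds the calling context frames.
//     } else if (const auto *P = dyn_cast<ProbeBasedCtxKey>(K)) {
//       // P->Probes holds the pseudo probe call path (declared below).
//     }
//   }
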
// Probe based context key as the intermediate key of context
// String based context key will introduce redundant string handling
// since the callee context is inferred from the context string which
// needs to be split by '@' to get the last location frame, so we
// can just use probes instead and generate the string at the end.
struct ProbeBasedCtxKey : public ContextKey {
  SmallVector<const MCDecodedPseudoProbe *, 16> Probes;

  ProbeBasedCtxKey() : ContextKey(CK_ProbeBased) {}
  static bool classof(const ContextKey *K) {
    return K->getKind() == CK_ProbeBased;
  }

  bool isEqual(const ContextKey *K) const override {
    const ProbeBasedCtxKey *O = dyn_cast<ProbeBasedCtxKey>(K);
    assert(O != nullptr && "Probe based key shouldn't be null in isEqual");
    return std::equal(Probes.begin(), Probes.end(), O->Probes.begin(),
                      O->Probes.end());
  }

  void genHashCode() override {
    for (const auto *P : Probes) {
      HashCode = hash_combine(HashCode, P);
    }
    if (HashCode == 0) {
      // Avoid zero value of HashCode when it's an empty list
      HashCode = 1;
    }
  }
};

// The counter of branch samples for one function indexed by the branch,
// which is represented as the source and target offset pair.
using BranchSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
// The counter of range samples for one function indexed by the range,
// which is represented as the start and end offset pair.
using RangeSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
// Wrapper for sample counters including range counter and branch counter
struct SampleCounter {
  RangeSample RangeCounter;
  BranchSample BranchCounter;

  void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) {
    assert(Start <= End && "Invalid instruction range");
    RangeCounter[{Start, End}] += Repeat;
  }
  void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) {
    BranchCounter[{Source, Target}] += Repeat;
  }
};

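// Illustrative recording sketch (hypothetical offsets and counts):
//
//   SampleCounter SC;
//   SC.recordRangeCount(0x540, 0x559, /*Repeat=*/100); // executed range
//   SC.recordBranchCount(0x559, 0x540, /*Repeat=*/99); // taken branch
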
// Sample counter with context to support context-sensitive profile
using ContextSampleCounterMap =
    std::unordered_map<Hashable<ContextKey>, SampleCounter,
                       Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;

struct FrameStack {
  SmallVector<uint64_t, 16> Stack;
  ProfiledBinary *Binary;
  FrameStack(ProfiledBinary *B) : Binary(B) {}
  bool pushFrame(UnwindState::ProfiledFrame *Cur) {
    assert(!Cur->isExternalFrame() &&
           "External frame's not expected for context stack.");
    Stack.push_back(Cur->Address);
    return true;
  }

  void popFrame() {
    if (!Stack.empty())
      Stack.pop_back();
  }
  std::shared_ptr<StringBasedCtxKey> getContextKey();
};

struct ProbeStack {
  SmallVector<const MCDecodedPseudoProbe *, 16> Stack;
  ProfiledBinary *Binary;
  ProbeStack(ProfiledBinary *B) : Binary(B) {}
  bool pushFrame(UnwindState::ProfiledFrame *Cur) {
    assert(!Cur->isExternalFrame() &&
           "External frame's not expected for context stack.");
    const MCDecodedPseudoProbe *CallProbe =
        Binary->getCallProbeForAddr(Cur->Address);
    // We may not find a probe for a merged or external callsite.
    // Callsite merging may cause the loss of original probe IDs.
    // Cutting off the context from here since the inliner will
    // not know how to consume a context with unknown callsites.
    if (!CallProbe)
      return false;
    Stack.push_back(CallProbe);
    return true;
  }

  void popFrame() {
    if (!Stack.empty())
      Stack.pop_back();
  }
  // Use a pseudo probe based context key to get the sample counter.
  // A context stands for a call path from 'main' to an uninlined
  // callee with all inline frames recovered on that path. The probes
  // belonging to that call path are the probes either originating from
  // the callee or from any functions inlined into the callee. Since
  // pseudo probes are organized in a trie after being decoded, the tree
  // path from the trie root (which is the uninlined callee) to the
  // probe node forms an inline context.
  // Here we use a list of probes (pointers) as the context key to speed
  // up aggregation, and the final context string will be generated in
  // ProfileGenerator.
  std::shared_ptr<ProbeBasedCtxKey> getContextKey();
};

/*
As in a hybrid sample we have a group of LBRs and the most recently sampled
call stack, we can walk through those LBRs to infer more call stacks which
would be used as context for the profile. VirtualUnwinder is the class to do
the call stack unwinding based on LBR state. Two types of unwinding are
processed here: 1) LBR unwinding and 2) linear range unwinding.
Specifically, for each LBR entry (which can be classified into call, return,
or regular branch), LBR unwinding will replay the operation by pushing,
popping or switching the leaf frame of the call stack. Since the initial call
stack is the most recently sampled one, the replay is in anti-execution order,
i.e. for the regular case, pop the call stack when the LBR is a call, push a
frame on the call stack when the LBR is a return. After each LBR is processed,
it also needs to align with the next LBR by going through the instructions
from the previous LBR's target to the current LBR's source, which is the
linear unwinding. As instructions in a linear range can come from different
functions due to inlining, linear unwinding will do the range splitting and
record counters by the ranges with the same inline context. During this
unwinding process we record each call stack as a context id and each
LBR/linear range as a sample counter for further CS profile generation.
*/
class VirtualUnwinder {
public:
  VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B)
      : CtxCounterMap(Counter), Binary(B) {}
  bool unwind(const PerfSample *Sample, uint64_t Repeat);
  std::set<uint64_t> &getUntrackedCallsites() { return UntrackedCallsites; }

  uint64_t NumTotalBranches = 0;
  uint64_t NumExtCallBranch = 0;
  uint64_t NumMissingExternalFrame = 0;
  uint64_t NumMismatchedProEpiBranch = 0;
  uint64_t NumMismatchedExtCallBranch = 0;

private:
  bool isCallState(UnwindState &State) const {
    // The tail call frame is always missing here in the stack sample; we will
    // use a specific tail call tracker to infer it.
    return Binary->addressIsCall(State.getCurrentLBRSource());
  }

  bool isReturnState(UnwindState &State) const {
    // Simply check addressIsReturn, as ret is always reliable, both for
    // regular calls and tail calls.
    if (!Binary->addressIsReturn(State.getCurrentLBRSource()))
      return false;

    // In a callback case, a return from internal code, say A, to the external
    // runtime can happen. The external runtime can then call back to
    // another internal routine, say B. Making an artificial branch that
    // looks like a return from A to B can confuse the unwinder into treating
    // the instruction before B as the call instruction. Here we detect this
    // case: if the return target is not the next instruction after a call
    // instruction, then we just do not treat it as a return.
    uint64_t CallAddr =
        Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget());
    return (CallAddr != 0);
  }

  void unwindCall(UnwindState &State);
  void unwindLinear(UnwindState &State, uint64_t Repeat);
  void unwindReturn(UnwindState &State);
  void unwindBranch(UnwindState &State);

  template <typename T>
  void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack);
  // Collect samples on each trie node by DFS traversal
  template <typename T>
  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack);
  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur);

  void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
                        uint64_t Repeat);
  void recordBranchCount(const LBREntry &Branch, UnwindState &State,
                         uint64_t Repeat);

  ContextSampleCounterMap *CtxCounterMap;
  // Profiled binary that the current frame address belongs to
  ProfiledBinary *Binary;
  // Keep track of all untracked callsites
  std::set<uint64_t> UntrackedCallsites;
};

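// A rough sketch of the replay loop described in the comment above
// (illustrative only; the actual control flow lives in the out-of-line
// definition of unwind()):
//
//   while (State.hasNextLBR()) {
//     // Linear unwinding: walk the range between adjacent LBRs to split it
//     // by inline context and record range counters.
//     unwindLinear(State, Repeat);
//     // LBR unwinding: replay this LBR on the frame trie.
//     if (isCallState(State))
//       unwindCall(State);
//     else if (isReturnState(State))
//       unwindReturn(State);
//     else
//       unwindBranch(State);
//     State.advanceLBR();
//   }
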
// Read perf trace to parse the events and samples.
class PerfReaderBase {
public:
  PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace)
      : Binary(B), PerfTraceFile(PerfTrace) {
    // Initialize the base address to preferred address.
    Binary->setBaseAddress(Binary->getPreferredBaseAddress());
  };
  virtual ~PerfReaderBase() = default;
  static std::unique_ptr<PerfReaderBase> create(ProfiledBinary *Binary,
                                                PerfInputFile &PerfInput);

  // Entry of the reader to parse multiple perf traces
  virtual void parsePerfTraces() = 0;
  const ContextSampleCounterMap &getSampleCounters() const {
    return SampleCounters;
  }
  bool profileIsCSFlat() { return ProfileIsCSFlat; }

protected:
  ProfiledBinary *Binary = nullptr;
  StringRef PerfTraceFile;

  ContextSampleCounterMap SampleCounters;
  bool ProfileIsCSFlat = false;

  uint64_t NumTotalSample = 0;
  uint64_t NumLeafExternalFrame = 0;
  uint64_t NumLeadingOutgoingLBR = 0;
};

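// Illustrative top-level usage (a minimal sketch with hypothetical driver
// code; error handling omitted):
//
//   PerfInputFile Input{"perf.script", PerfFormat::PerfScript,
//                       PerfContent::LBRStack};
//   std::unique_ptr<PerfReaderBase> Reader =
//       PerfReaderBase::create(Binary, Input);
//   Reader->parsePerfTraces();
//   const ContextSampleCounterMap &Counters = Reader->getSampleCounters();
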
// Read perf script to parse the events and samples.
class PerfScriptReader : public PerfReaderBase {
public:
  PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace)
      : PerfReaderBase(B, PerfTrace){};

  // Entry of the reader to parse multiple perf traces
  virtual void parsePerfTraces() override;
  // Generate perf script from perf data
  static PerfInputFile convertPerfDataToTrace(ProfiledBinary *Binary,
                                              PerfInputFile &File);
  // Extract perf script type by peeking at the input
  static PerfContent checkPerfScriptType(StringRef FileName);

protected:
  // The parsed MMap event
  struct MMapEvent {
    uint64_t PID = 0;
    uint64_t Address = 0;
    uint64_t Size = 0;
    uint64_t Offset = 0;
    StringRef BinaryPath;
  };

  // Check whether a given line is an LBR sample
  static bool isLBRSample(StringRef Line);
  // Check whether a given line is an MMAP event
  static bool isMMap2Event(StringRef Line);
  // Parse a single line of a PERF_RECORD_MMAP2 event looking for a
  // mapping between the binary name and its memory layout.
  static bool extractMMap2EventForBinary(ProfiledBinary *Binary, StringRef Line,
                                         MMapEvent &MMap);
  // Update base address based on mmap events
  void updateBinaryAddress(const MMapEvent &Event);
  // Parse mmap event and update binary address
  void parseMMap2Event(TraceStream &TraceIt);
  // Parse perf events/samples and do aggregation
  void parseAndAggregateTrace();
  // Parse either an MMAP event or a perf sample
  void parseEventOrSample(TraceStream &TraceIt);
  // Warn if the relevant mmap event is missing.
  void warnIfMissingMMap();
  // Emit accumulated warnings.
  void warnTruncatedStack();
  // Warn if range is invalid.
  void warnInvalidRange();
  // Extract call stack from the perf trace lines
  bool extractCallstack(TraceStream &TraceIt,
                        SmallVectorImpl<uint64_t> &CallStack);
  // Extract LBR stack from one perf trace line
  bool extractLBRStack(TraceStream &TraceIt,
                       SmallVectorImpl<LBREntry> &LBRStack);
  uint64_t parseAggregatedCount(TraceStream &TraceIt);
  // Parse one sample from multiple perf lines; override this for different
  // sample types.
  void parseSample(TraceStream &TraceIt);
  // An aggregated count is given to indicate how many times the sample is
  // repeated.
  virtual void parseSample(TraceStream &TraceIt, uint64_t Count){};
  void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat);
  // Post-process the profile after trace aggregation. We will do simple range
  // overlap computation for AutoFDO, or unwinding for CSSPGO (hybrid samples).
  virtual void generateUnsymbolizedProfile();
  void writeUnsymbolizedProfile(StringRef Filename);
  void writeUnsymbolizedProfile(raw_fd_ostream &OS);

  // Samples with their repeat counts, generated by the perf reader
  AggregatedCounter AggregatedSamples;
  // Keep track of all invalid return addresses
  std::set<uint64_t> InvalidReturnAddresses;
};

/*
The reader of LBR only perf script.
A typical LBR sample is like:
  40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
  ... 0x4005c8/0x4005dc/P/-/-/0
*/
class LBRPerfReader : public PerfScriptReader {
public:
  LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace)
      : PerfScriptReader(Binary, PerfTrace){};
  // Parse the LBR only sample.
  virtual void parseSample(TraceStream &TraceIt, uint64_t Count) override;
};

/*
Hybrid perf script includes a group of hybrid samples (LBRs + call stack),
which is used to generate a CS profile. An example of a hybrid sample:
  4005dc    # call stack leaf
  400634
  400684    # call stack root
  0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
  ... 0x4005c8/0x4005dc/P/-/-/0    # LBR Entries
*/
class HybridPerfReader : public PerfScriptReader {
public:
  HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace)
      : PerfScriptReader(Binary, PerfTrace){};
  // Parse the hybrid sample including the call stack and LBR line
  void parseSample(TraceStream &TraceIt, uint64_t Count) override;
  void generateUnsymbolizedProfile() override;

private:
  // Unwind the hybrid samples after aggregation
  void unwindSamples();
};

/*
Format of unsymbolized profile:

  [frame1 @ frame2 @ ...]  # If it's a CS profile
  number of entries in RangeCounter
  from_1-to_1:count_1
  from_2-to_2:count_2
  ......
  from_n-to_n:count_n
  number of entries in BranchCounter
  src_1->dst_1:count_1
  src_2->dst_2:count_2
  ......
  src_n->dst_n:count_n
  [frame1 @ frame2 @ ...]  # Next context
  ......

Note that a non-CS profile doesn't have the empty `[]` context.
*/
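// A hypothetical entry in the format above (frames, offsets and counts are
// made up purely for illustration):
//
//   [foo:3 @ bar]
//   2
//   540-559:100
//   560-588:80
//   1
//   588->540:99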
class UnsymbolizedProfileReader : public PerfReaderBase {
public:
  UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace)
      : PerfReaderBase(Binary, PerfTrace){};
  void parsePerfTraces() override;

private:
  void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters);
  void readUnsymbolizedProfile(StringRef Filename);

  std::unordered_set<std::string> ContextStrSet;
};

} // end namespace sampleprof
} // end namespace llvm

#endif