forked from OSchip/llvm-project
[perf2bolt] Add support for generating autofdo input
Summary: Autofdo tools support. (cherry picked from FBD13779026)
This commit is contained in:
parent
c6ce2abb7d
commit
af81c7ff80
|
@ -45,6 +45,13 @@ BasicAggregation("nl",
|
||||||
cl::ZeroOrMore,
|
cl::ZeroOrMore,
|
||||||
cl::cat(AggregatorCategory));
|
cl::cat(AggregatorCategory));
|
||||||
|
|
||||||
|
static cl::opt<bool>
|
||||||
|
WriteAutoFDOData("autofdo",
|
||||||
|
cl::desc("generate autofdo textual data instead of bolt data"),
|
||||||
|
cl::init(false),
|
||||||
|
cl::ZeroOrMore,
|
||||||
|
cl::cat(AggregatorCategory));
|
||||||
|
|
||||||
static cl::opt<bool>
|
static cl::opt<bool>
|
||||||
ReadPreAggregated("pa",
|
ReadPreAggregated("pa",
|
||||||
cl::desc("skip perf and read data from a pre-aggregated file format"),
|
cl::desc("skip perf and read data from a pre-aggregated file format"),
|
||||||
|
@ -123,7 +130,7 @@ void DataAggregator::start(StringRef PerfDataFilename) {
|
||||||
} else {
|
} else {
|
||||||
launchPerfProcess("branch events",
|
launchPerfProcess("branch events",
|
||||||
MainEventsPPI,
|
MainEventsPPI,
|
||||||
"script -F pid,brstack",
|
"script -F pid,ip,brstack",
|
||||||
/*Wait = */false);
|
/*Wait = */false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -323,6 +330,75 @@ void DataAggregator::parsePreAggregated() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::error_code DataAggregator::writeAutoFDOData() {
|
||||||
|
outs() << "PERF2BOLT: writing data for autofdo tools...\n";
|
||||||
|
NamedRegionTimer T("writeAutoFDO", "Processing branch events",
|
||||||
|
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
|
||||||
|
|
||||||
|
std::error_code EC;
|
||||||
|
raw_fd_ostream OutFile(OutputFDataName, EC, sys::fs::OpenFlags::F_None);
|
||||||
|
if (EC)
|
||||||
|
return EC;
|
||||||
|
|
||||||
|
// Format:
|
||||||
|
// number of unique traces
|
||||||
|
// from_1-to_1:count_1
|
||||||
|
// from_2-to_2:count_2
|
||||||
|
// ......
|
||||||
|
// from_n-to_n:count_n
|
||||||
|
// number of unique sample addresses
|
||||||
|
// addr_1:count_1
|
||||||
|
// addr_2:count_2
|
||||||
|
// ......
|
||||||
|
// addr_n:count_n
|
||||||
|
// number of unique LBR entries
|
||||||
|
// src_1->dst_1:count_1
|
||||||
|
// src_2->dst_2:count_2
|
||||||
|
// ......
|
||||||
|
// src_n->dst_n:count_n
|
||||||
|
|
||||||
|
const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress;
|
||||||
|
|
||||||
|
// AutoFDO addresses are relative to the first allocated loadable program
|
||||||
|
// segment
|
||||||
|
auto filterAddress = [&FirstAllocAddress](uint64_t Address) -> uint64_t {
|
||||||
|
if (Address < FirstAllocAddress)
|
||||||
|
return 0;
|
||||||
|
return Address - FirstAllocAddress;
|
||||||
|
};
|
||||||
|
|
||||||
|
OutFile << FallthroughLBRs.size() << "\n";
|
||||||
|
for (const auto &AggrLBR : FallthroughLBRs) {
|
||||||
|
auto &Trace = AggrLBR.first;
|
||||||
|
auto &Info = AggrLBR.second;
|
||||||
|
OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "-"
|
||||||
|
<< Twine::utohexstr(filterAddress(Trace.To)) << ":"
|
||||||
|
<< (Info.InternCount + Info.ExternCount) << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
OutFile << BasicSamples.size() << "\n";
|
||||||
|
for (const auto &Sample : BasicSamples) {
|
||||||
|
auto PC = Sample.first;
|
||||||
|
auto HitCount = Sample.second;
|
||||||
|
OutFile << Twine::utohexstr(filterAddress(PC)) << ":" << HitCount << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
OutFile << BranchLBRs.size() << "\n";
|
||||||
|
for (const auto &AggrLBR : BranchLBRs) {
|
||||||
|
auto &Trace = AggrLBR.first;
|
||||||
|
auto &Info = AggrLBR.second;
|
||||||
|
OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "->"
|
||||||
|
<< Twine::utohexstr(filterAddress(Trace.To)) << ":"
|
||||||
|
<< Info.TakenCount << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, "
|
||||||
|
<< BasicSamples.size() << " sample addresses and " << BranchLBRs.size()
|
||||||
|
<< " unique branches to " << OutputFDataName << "\n";
|
||||||
|
|
||||||
|
return std::error_code();
|
||||||
|
}
|
||||||
|
|
||||||
void DataAggregator::parseProfile(
|
void DataAggregator::parseProfile(
|
||||||
BinaryContext &BC,
|
BinaryContext &BC,
|
||||||
std::map<uint64_t, BinaryFunction> &BFs) {
|
std::map<uint64_t, BinaryFunction> &BFs) {
|
||||||
|
@ -388,6 +464,15 @@ void DataAggregator::parseProfile(
|
||||||
errs() << "PERF2BOLT: failed to parse samples\n";
|
errs() << "PERF2BOLT: failed to parse samples\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We can finish early if the goal is just to generate data for autofdo
|
||||||
|
if (opts::WriteAutoFDOData) {
|
||||||
|
if (std::error_code EC = writeAutoFDOData()) {
|
||||||
|
errs() << "Error writing autofdo data to file: " << EC.message() << "\n";
|
||||||
|
}
|
||||||
|
deleteTempFiles();
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
// Special handling for memory events
|
// Special handling for memory events
|
||||||
std::string Error;
|
std::string Error;
|
||||||
auto PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error);
|
auto PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error);
|
||||||
|
@ -475,8 +560,8 @@ DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) {
|
||||||
return &FI->second;
|
return &FI->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
|
||||||
DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) {
|
uint64_t Count) {
|
||||||
auto I = FuncsToSamples.find(Func.getNames()[0]);
|
auto I = FuncsToSamples.find(Func.getNames()[0]);
|
||||||
if (I == FuncsToSamples.end()) {
|
if (I == FuncsToSamples.end()) {
|
||||||
bool Success;
|
bool Success;
|
||||||
|
@ -485,7 +570,7 @@ DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) {
|
||||||
FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy())));
|
FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy())));
|
||||||
}
|
}
|
||||||
|
|
||||||
I->second.bumpCount(Address - Func.getAddress());
|
I->second.bumpCount(Address - Func.getAddress(), Count);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -682,6 +767,16 @@ ErrorOr<DataAggregator::PerfBranchSample> DataAggregator::parseBranchSample() {
|
||||||
return Res;
|
return Res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
while (checkAndConsumeFS()) {}
|
||||||
|
|
||||||
|
auto PCRes = parseHexField(FieldSeparator, true);
|
||||||
|
if (std::error_code EC = PCRes.getError())
|
||||||
|
return EC;
|
||||||
|
Res.PC = PCRes.get();
|
||||||
|
|
||||||
|
if (checkAndConsumeNewLine())
|
||||||
|
return Res;
|
||||||
|
|
||||||
while (!checkAndConsumeNewLine()) {
|
while (!checkAndConsumeNewLine()) {
|
||||||
checkAndConsumeFS();
|
checkAndConsumeFS();
|
||||||
|
|
||||||
|
@ -890,6 +985,9 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||||
return EC;
|
return EC;
|
||||||
|
|
||||||
auto &Sample = SampleRes.get();
|
auto &Sample = SampleRes.get();
|
||||||
|
if (opts::WriteAutoFDOData)
|
||||||
|
++BasicSamples[Sample.PC];
|
||||||
|
|
||||||
if (Sample.LBR.empty())
|
if (Sample.LBR.empty())
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
@ -1041,7 +1139,8 @@ std::error_code DataAggregator::parseBasicEvents() {
|
||||||
if (auto *BF = getBinaryFunctionContainingAddress(Sample->PC))
|
if (auto *BF = getBinaryFunctionContainingAddress(Sample->PC))
|
||||||
BF->setHasProfileAvailable();
|
BF->setHasProfileAvailable();
|
||||||
|
|
||||||
BasicSamples.emplace_back(std::move(Sample.get()));
|
++BasicSamples[Sample->PC];
|
||||||
|
EventNames.insert(Sample->EventName);
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::error_code();
|
return std::error_code();
|
||||||
|
@ -1052,17 +1151,19 @@ void DataAggregator::processBasicEvents() {
|
||||||
NamedRegionTimer T("processBasic", "Processing basic events",
|
NamedRegionTimer T("processBasic", "Processing basic events",
|
||||||
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
|
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
|
||||||
uint64_t OutOfRangeSamples{0};
|
uint64_t OutOfRangeSamples{0};
|
||||||
|
uint64_t NumSamples{0};
|
||||||
for (auto &Sample : BasicSamples) {
|
for (auto &Sample : BasicSamples) {
|
||||||
auto *Func = getBinaryFunctionContainingAddress(Sample.PC);
|
const auto PC = Sample.first;
|
||||||
|
const auto HitCount = Sample.second;
|
||||||
|
NumSamples += HitCount;
|
||||||
|
auto *Func = getBinaryFunctionContainingAddress(PC);
|
||||||
if (!Func) {
|
if (!Func) {
|
||||||
++OutOfRangeSamples;
|
OutOfRangeSamples += HitCount;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
doSample(*Func, Sample.PC);
|
doSample(*Func, PC, HitCount);
|
||||||
EventNames.insert(Sample.EventName);
|
|
||||||
}
|
}
|
||||||
const auto NumSamples = BasicSamples.size();
|
|
||||||
outs() << "PERF2BOLT: read " << NumSamples << " samples\n";
|
outs() << "PERF2BOLT: read " << NumSamples << " samples\n";
|
||||||
|
|
||||||
outs() << "PERF2BOLT: out of range samples recorded in unknown regions: "
|
outs() << "PERF2BOLT: out of range samples recorded in unknown regions: "
|
||||||
|
|
|
@ -53,6 +53,7 @@ class DataAggregator : public DataReader {
|
||||||
|
|
||||||
struct PerfBranchSample {
|
struct PerfBranchSample {
|
||||||
SmallVector<LBREntry, 16> LBR;
|
SmallVector<LBREntry, 16> LBR;
|
||||||
|
uint64_t PC;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct PerfBasicSample {
|
struct PerfBasicSample {
|
||||||
|
@ -106,7 +107,7 @@ class DataAggregator : public DataReader {
|
||||||
std::unordered_map<Trace, BranchInfo, TraceHash> BranchLBRs;
|
std::unordered_map<Trace, BranchInfo, TraceHash> BranchLBRs;
|
||||||
std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
|
std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
|
||||||
std::vector<AggregatedLBREntry> AggregatedLBRs;
|
std::vector<AggregatedLBREntry> AggregatedLBRs;
|
||||||
std::vector<PerfBasicSample> BasicSamples;
|
std::unordered_map<uint64_t, uint64_t> BasicSamples;
|
||||||
std::vector<PerfMemSample> MemSamples;
|
std::vector<PerfMemSample> MemSamples;
|
||||||
|
|
||||||
template<typename T> void clear(T& Container) {
|
template<typename T> void clear(T& Container) {
|
||||||
|
@ -197,7 +198,7 @@ class DataAggregator : public DataReader {
|
||||||
|
|
||||||
/// Semantic actions - parser hooks to interpret parsed perf samples
|
/// Semantic actions - parser hooks to interpret parsed perf samples
|
||||||
/// Register a sample (non-LBR mode), i.e. a new hit at \p Address
|
/// Register a sample (non-LBR mode), i.e. a new hit at \p Address
|
||||||
bool doSample(BinaryFunction &Func, const uint64_t Address);
|
bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count);
|
||||||
|
|
||||||
/// Register an intraprocedural branch \p Branch.
|
/// Register an intraprocedural branch \p Branch.
|
||||||
bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
|
bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
|
||||||
|
@ -256,6 +257,9 @@ class DataAggregator : public DataReader {
|
||||||
/// Process all branch events.
|
/// Process all branch events.
|
||||||
void processBranchEvents();
|
void processBranchEvents();
|
||||||
|
|
||||||
|
/// This member function supports generating data for AutoFDO LLVM tools.
|
||||||
|
std::error_code writeAutoFDOData();
|
||||||
|
|
||||||
/// Parse the full output generated by perf script to report non-LBR samples.
|
/// Parse the full output generated by perf script to report non-LBR samples.
|
||||||
std::error_code parseBasicEvents();
|
std::error_code parseBasicEvents();
|
||||||
|
|
||||||
|
|
|
@ -118,15 +118,15 @@ FuncSampleData::getSamples(uint64_t Start, uint64_t End) const {
|
||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void FuncSampleData::bumpCount(uint64_t Offset) {
|
void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) {
|
||||||
auto Iter = Index.find(Offset);
|
auto Iter = Index.find(Offset);
|
||||||
if (Iter == Index.end()) {
|
if (Iter == Index.end()) {
|
||||||
Data.emplace_back(Location(true, Name, Offset), 1);
|
Data.emplace_back(Location(true, Name, Offset), Count);
|
||||||
Index[Offset] = Data.size() - 1;
|
Index[Offset] = Data.size() - 1;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
auto &SI = Data[Iter->second];
|
auto &SI = Data[Iter->second];
|
||||||
++SI.Hits;
|
SI.Hits += Count;
|
||||||
}
|
}
|
||||||
|
|
||||||
void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo,
|
void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo,
|
||||||
|
|
|
@ -285,7 +285,7 @@ struct FuncSampleData {
|
||||||
/// Aggregation helper
|
/// Aggregation helper
|
||||||
DenseMap<uint64_t, size_t> Index;
|
DenseMap<uint64_t, size_t> Index;
|
||||||
|
|
||||||
void bumpCount(uint64_t Offset);
|
void bumpCount(uint64_t Offset, uint64_t Count);
|
||||||
};
|
};
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
Loading…
Reference in New Issue