From af81c7ff803a9f5c47e112cd13eb73feee0e51e6 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 22 Jan 2019 17:21:45 -0800 Subject: [PATCH] [perf2bolt] Add support for generating autofdo input Summary: Autofdo tools support. (cherry picked from FBD13779026) --- bolt/src/DataAggregator.cpp | 121 +++++++++++++++++++++++++++++++++--- bolt/src/DataAggregator.h | 8 ++- bolt/src/DataReader.cpp | 6 +- bolt/src/DataReader.h | 2 +- 4 files changed, 121 insertions(+), 16 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 00d5451007bd..29b21609819a 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -45,6 +45,13 @@ BasicAggregation("nl", cl::ZeroOrMore, cl::cat(AggregatorCategory)); +static cl::opt +WriteAutoFDOData("autofdo", + cl::desc("generate autofdo textual data instead of bolt data"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + static cl::opt ReadPreAggregated("pa", cl::desc("skip perf and read data from a pre-aggregated file format"), @@ -123,7 +130,7 @@ void DataAggregator::start(StringRef PerfDataFilename) { } else { launchPerfProcess("branch events", MainEventsPPI, - "script -F pid,brstack", + "script -F pid,ip,brstack", /*Wait = */false); } @@ -323,6 +330,75 @@ void DataAggregator::parsePreAggregated() { } } +std::error_code DataAggregator::writeAutoFDOData() { + outs() << "PERF2BOLT: writing data for autofdo tools...\n"; + NamedRegionTimer T("writeAutoFDO", "Processing branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + std::error_code EC; + raw_fd_ostream OutFile(OutputFDataName, EC, sys::fs::OpenFlags::F_None); + if (EC) + return EC; + + // Format: + // number of unique traces + // from_1-to_1:count_1 + // from_2-to_2:count_2 + // ...... + // from_n-to_n:count_n + // number of unique sample addresses + // addr_1:count_1 + // addr_2:count_2 + // ...... + // addr_n:count_n + // number of unique LBR entries + // src_1->dst_1:count_1 + // src_2->dst_2:count_2 + // ...... + // src_n->dst_n:count_n + + const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress; + + // AutoFDO addresses are relative to the first allocated loadable program + // segment + auto filterAddress = [&FirstAllocAddress](uint64_t Address) -> uint64_t { + if (Address < FirstAllocAddress) + return 0; + return Address - FirstAllocAddress; + }; + + OutFile << FallthroughLBRs.size() << "\n"; + for (const auto &AggrLBR : FallthroughLBRs) { + auto &Trace = AggrLBR.first; + auto &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "-" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << (Info.InternCount + Info.ExternCount) << "\n"; + } + + OutFile << BasicSamples.size() << "\n"; + for (const auto &Sample : BasicSamples) { + auto PC = Sample.first; + auto HitCount = Sample.second; + OutFile << Twine::utohexstr(filterAddress(PC)) << ":" << HitCount << "\n"; + } + + OutFile << BranchLBRs.size() << "\n"; + for (const auto &AggrLBR : BranchLBRs) { + auto &Trace = AggrLBR.first; + auto &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "->" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << Info.TakenCount << "\n"; + } + + outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, " + << BasicSamples.size() << " sample addresses and " << BranchLBRs.size() + << " unique branches to " << OutputFDataName << "\n"; + + return std::error_code(); +} + void DataAggregator::parseProfile( BinaryContext &BC, std::map &BFs) { @@ -388,6 +464,15 @@ void DataAggregator::parseProfile( errs() << "PERF2BOLT: failed to parse samples\n"; } + // We can finish early if the goal is just to generate data for autofdo + if (opts::WriteAutoFDOData) { + if (std::error_code EC = writeAutoFDOData()) { + errs() << "Error writing autofdo data to file: " << EC.message() << "\n"; + } + deleteTempFiles(); + exit(0); + } + // Special handling for memory events std::string Error; auto PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error); @@ -475,8 +560,8 @@ DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) { return &FI->second; } -bool -DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) { +bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address, + uint64_t Count) { auto I = FuncsToSamples.find(Func.getNames()[0]); if (I == FuncsToSamples.end()) { bool Success; @@ -485,7 +570,7 @@ DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) { FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy()))); } - I->second.bumpCount(Address - Func.getAddress()); + I->second.bumpCount(Address - Func.getAddress(), Count); return true; } @@ -682,6 +767,16 @@ ErrorOr DataAggregator::parseBranchSample() { return Res; } + while (checkAndConsumeFS()) {} + + auto PCRes = parseHexField(FieldSeparator, true); + if (std::error_code EC = PCRes.getError()) + return EC; + Res.PC = PCRes.get(); + + if (checkAndConsumeNewLine()) + return Res; + while (!checkAndConsumeNewLine()) { checkAndConsumeFS(); @@ -890,6 +985,9 @@ std::error_code DataAggregator::parseBranchEvents() { return EC; auto &Sample = SampleRes.get(); + if (opts::WriteAutoFDOData) + ++BasicSamples[Sample.PC]; + if (Sample.LBR.empty()) continue; @@ -1041,7 +1139,8 @@ std::error_code DataAggregator::parseBasicEvents() { if (auto *BF = getBinaryFunctionContainingAddress(Sample->PC)) BF->setHasProfileAvailable(); - BasicSamples.emplace_back(std::move(Sample.get())); + ++BasicSamples[Sample->PC]; + EventNames.insert(Sample->EventName); } return std::error_code(); @@ -1052,17 +1151,19 @@ void DataAggregator::processBasicEvents() { NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); uint64_t OutOfRangeSamples{0}; + uint64_t NumSamples{0}; for (auto &Sample : BasicSamples) { - auto *Func = getBinaryFunctionContainingAddress(Sample.PC); + const auto PC = Sample.first; + const auto HitCount = Sample.second; + NumSamples += HitCount; + auto *Func = getBinaryFunctionContainingAddress(PC); if (!Func) { - ++OutOfRangeSamples; + OutOfRangeSamples += HitCount; continue; } - doSample(*Func, Sample.PC); - EventNames.insert(Sample.EventName); + doSample(*Func, PC, HitCount); } - const auto NumSamples = BasicSamples.size(); outs() << "PERF2BOLT: read " << NumSamples << " samples\n"; outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index d181e7312fd6..151dc68e77ba 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -53,6 +53,7 @@ class DataAggregator : public DataReader { struct PerfBranchSample { SmallVector LBR; + uint64_t PC; }; struct PerfBasicSample { @@ -106,7 +107,7 @@ class DataAggregator : public DataReader { std::unordered_map BranchLBRs; std::unordered_map FallthroughLBRs; std::vector AggregatedLBRs; - std::vector BasicSamples; + std::unordered_map BasicSamples; std::vector MemSamples; template void clear(T& Container) { @@ -197,7 +198,7 @@ class DataAggregator : public DataReader { /// Semantic actions - parser hooks to interpret parsed perf samples /// Register a sample (non-LBR mode), i.e. a new hit at \p Address - bool doSample(BinaryFunction &Func, const uint64_t Address); + bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count); /// Register an intraprocedural branch \p Branch. bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To, @@ -256,6 +257,9 @@ class DataAggregator : public DataReader { /// Process all branch events. void processBranchEvents(); + /// This member function supports generating data for AutoFDO LLVM tools. + std::error_code writeAutoFDOData(); + /// Parse the full output generated by perf script to report non-LBR samples. std::error_code parseBasicEvents(); diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index 335119e30c3e..be179e482918 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -118,15 +118,15 @@ FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { return Result; } -void FuncSampleData::bumpCount(uint64_t Offset) { +void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) { auto Iter = Index.find(Offset); if (Iter == Index.end()) { - Data.emplace_back(Location(true, Name, Offset), 1); + Data.emplace_back(Location(true, Name, Offset), Count); Index[Offset] = Data.size() - 1; return; } auto &SI = Data[Iter->second]; - ++SI.Hits; + SI.Hits += Count; } void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, diff --git a/bolt/src/DataReader.h b/bolt/src/DataReader.h index fe5c6a548ffd..50b901b9f5b5 100644 --- a/bolt/src/DataReader.h +++ b/bolt/src/DataReader.h @@ -285,7 +285,7 @@ struct FuncSampleData { /// Aggregation helper DenseMap Index; - void bumpCount(uint64_t Offset); + void bumpCount(uint64_t Offset, uint64_t Count); }; //===----------------------------------------------------------------------===//