[perf2bolt] Add support for generating autofdo input

Summary:
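Add an "autofdo" option to perf2bolt that writes the aggregated profile as
textual data for AutoFDO tools instead of BOLT data. To support this, perf is
now asked for the sample IP alongside the branch stack, and basic samples are
aggregated as per-address hit counts.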

(cherry picked from FBD13779026)
This commit is contained in:
Rafael Auler 2019-01-22 17:21:45 -08:00 committed by Maksim Panchenko
parent c6ce2abb7d
commit af81c7ff80
4 changed files with 121 additions and 16 deletions

View File

@ -45,6 +45,13 @@ BasicAggregation("nl",
cl::ZeroOrMore,
cl::cat(AggregatorCategory));
// Boolean "autofdo" option, off by default. When enabled, the aggregator
// generates textual data for autofdo tools instead of bolt data.
static cl::opt<bool>
WriteAutoFDOData("autofdo",
cl::desc("generate autofdo textual data instead of bolt data"),
cl::init(false),
cl::ZeroOrMore,
cl::cat(AggregatorCategory));
static cl::opt<bool>
ReadPreAggregated("pa",
cl::desc("skip perf and read data from a pre-aggregated file format"),
@ -123,7 +130,7 @@ void DataAggregator::start(StringRef PerfDataFilename) {
} else {
launchPerfProcess("branch events",
MainEventsPPI,
"script -F pid,brstack",
"script -F pid,ip,brstack",
/*Wait = */false);
}
@ -323,6 +330,75 @@ void DataAggregator::parsePreAggregated() {
}
}
/// Write the aggregated profile to OutputFDataName in the textual layout used
/// by AutoFDO tools.
///
/// The file consists of three sections written back to back. Each section
/// starts with a line holding its number of entries, followed by one line per
/// entry:
///
///   number of unique traces
///   from_1-to_1:count_1
///   ......
///   from_n-to_n:count_n
///   number of unique sample addresses
///   addr_1:count_1
///   ......
///   addr_n:count_n
///   number of unique LBR entries
///   src_1->dst_1:count_1
///   ......
///   src_n->dst_n:count_n
///
/// \returns the error produced while opening the output file, or a
/// default-constructed error code after all sections have been emitted.
std::error_code DataAggregator::writeAutoFDOData() {
  outs() << "PERF2BOLT: writing data for autofdo tools...\n";
  // The description names this region in the timing report; it must describe
  // the writing step rather than branch-event processing.
  NamedRegionTimer T("writeAutoFDO", "Writing data for autofdo tools",
                     TimerGroupName, TimerGroupDesc, opts::TimeAggregator);

  std::error_code EC;
  raw_fd_ostream OutFile(OutputFDataName, EC, sys::fs::OpenFlags::F_None);
  if (EC)
    return EC;

  // AutoFDO addresses are relative to the first allocated loadable program
  // segment. Anything below that base is reported as offset 0.
  const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress;
  auto filterAddress = [FirstAllocAddress](uint64_t Address) -> uint64_t {
    if (Address < FirstAllocAddress)
      return 0;
    return Address - FirstAllocAddress;
  };

  // Section 1: fall-through traces, counting internal and external hits
  // together.
  OutFile << FallthroughLBRs.size() << "\n";
  for (const auto &AggrLBR : FallthroughLBRs) {
    const auto &Trace = AggrLBR.first;
    const auto &Info = AggrLBR.second;
    OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "-"
            << Twine::utohexstr(filterAddress(Trace.To)) << ":"
            << (Info.InternCount + Info.ExternCount) << "\n";
  }

  // Section 2: sampled addresses with their hit counts.
  OutFile << BasicSamples.size() << "\n";
  for (const auto &Sample : BasicSamples) {
    const auto PC = Sample.first;
    const auto HitCount = Sample.second;
    OutFile << Twine::utohexstr(filterAddress(PC)) << ":" << HitCount << "\n";
  }

  // Section 3: branches with their taken counts.
  OutFile << BranchLBRs.size() << "\n";
  for (const auto &AggrLBR : BranchLBRs) {
    const auto &Trace = AggrLBR.first;
    const auto &Info = AggrLBR.second;
    OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "->"
            << Twine::utohexstr(filterAddress(Trace.To)) << ":"
            << Info.TakenCount << "\n";
  }

  outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, "
         << BasicSamples.size() << " sample addresses and " << BranchLBRs.size()
         << " unique branches to " << OutputFDataName << "\n";

  return std::error_code();
}
void DataAggregator::parseProfile(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs) {
@ -388,6 +464,15 @@ void DataAggregator::parseProfile(
errs() << "PERF2BOLT: failed to parse samples\n";
}
// We can finish early if the goal is just to generate data for autofdo.
// The process terminates here in either case, so a failed write has to be
// reflected in the exit status instead of always signalling success.
if (opts::WriteAutoFDOData) {
  int ExitCode = 0;
  if (std::error_code EC = writeAutoFDOData()) {
    errs() << "Error writing autofdo data to file: " << EC.message() << "\n";
    ExitCode = 1;
  }
  deleteTempFiles();
  exit(ExitCode);
}
// Special handling for memory events
std::string Error;
auto PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error);
@ -475,8 +560,8 @@ DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) {
return &FI->second;
}
bool
DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) {
bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address,
uint64_t Count) {
auto I = FuncsToSamples.find(Func.getNames()[0]);
if (I == FuncsToSamples.end()) {
bool Success;
@ -485,7 +570,7 @@ DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) {
FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy())));
}
I->second.bumpCount(Address - Func.getAddress());
I->second.bumpCount(Address - Func.getAddress(), Count);
return true;
}
@ -682,6 +767,16 @@ ErrorOr<DataAggregator::PerfBranchSample> DataAggregator::parseBranchSample() {
return Res;
}
while (checkAndConsumeFS()) {}
auto PCRes = parseHexField(FieldSeparator, true);
if (std::error_code EC = PCRes.getError())
return EC;
Res.PC = PCRes.get();
if (checkAndConsumeNewLine())
return Res;
while (!checkAndConsumeNewLine()) {
checkAndConsumeFS();
@ -890,6 +985,9 @@ std::error_code DataAggregator::parseBranchEvents() {
return EC;
auto &Sample = SampleRes.get();
if (opts::WriteAutoFDOData)
++BasicSamples[Sample.PC];
if (Sample.LBR.empty())
continue;
@ -1041,7 +1139,8 @@ std::error_code DataAggregator::parseBasicEvents() {
if (auto *BF = getBinaryFunctionContainingAddress(Sample->PC))
BF->setHasProfileAvailable();
BasicSamples.emplace_back(std::move(Sample.get()));
++BasicSamples[Sample->PC];
EventNames.insert(Sample->EventName);
}
return std::error_code();
@ -1052,17 +1151,19 @@ void DataAggregator::processBasicEvents() {
NamedRegionTimer T("processBasic", "Processing basic events",
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
uint64_t OutOfRangeSamples{0};
uint64_t NumSamples{0};
for (auto &Sample : BasicSamples) {
auto *Func = getBinaryFunctionContainingAddress(Sample.PC);
const auto PC = Sample.first;
const auto HitCount = Sample.second;
NumSamples += HitCount;
auto *Func = getBinaryFunctionContainingAddress(PC);
if (!Func) {
++OutOfRangeSamples;
OutOfRangeSamples += HitCount;
continue;
}
doSample(*Func, Sample.PC);
EventNames.insert(Sample.EventName);
doSample(*Func, PC, HitCount);
}
const auto NumSamples = BasicSamples.size();
outs() << "PERF2BOLT: read " << NumSamples << " samples\n";
outs() << "PERF2BOLT: out of range samples recorded in unknown regions: "

View File

@ -53,6 +53,7 @@ class DataAggregator : public DataReader {
/// A single sample parsed from perf's branch-stack output.
struct PerfBranchSample {
  /// Branch records collected for this sample; may be empty.
  SmallVector<LBREntry, 16> LBR;
  /// Sampled address parsed from the perf output. Value-initialized because
  /// the parser can hand back a sample before this field is filled in, and
  /// consumers read it unconditionally.
  uint64_t PC{0};
};
struct PerfBasicSample {
@ -106,7 +107,7 @@ class DataAggregator : public DataReader {
std::unordered_map<Trace, BranchInfo, TraceHash> BranchLBRs;
std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
std::vector<AggregatedLBREntry> AggregatedLBRs;
std::vector<PerfBasicSample> BasicSamples;
std::unordered_map<uint64_t, uint64_t> BasicSamples;
std::vector<PerfMemSample> MemSamples;
template<typename T> void clear(T& Container) {
@ -197,7 +198,7 @@ class DataAggregator : public DataReader {
/// Semantic actions - parser hooks to interpret parsed perf samples
/// Register a sample (non-LBR mode), i.e. a new hit at \p Address
bool doSample(BinaryFunction &Func, const uint64_t Address);
bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count);
/// Register an intraprocedural branch \p Branch.
bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
@ -256,6 +257,9 @@ class DataAggregator : public DataReader {
/// Process all branch events.
void processBranchEvents();
/// This member function supports generating data for AutoFDO LLVM tools.
std::error_code writeAutoFDOData();
/// Parse the full output generated by perf script to report non-LBR samples.
std::error_code parseBasicEvents();

View File

@ -118,15 +118,15 @@ FuncSampleData::getSamples(uint64_t Start, uint64_t End) const {
return Result;
}
void FuncSampleData::bumpCount(uint64_t Offset) {
void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) {
auto Iter = Index.find(Offset);
if (Iter == Index.end()) {
Data.emplace_back(Location(true, Name, Offset), 1);
Data.emplace_back(Location(true, Name, Offset), Count);
Index[Offset] = Data.size() - 1;
return;
}
auto &SI = Data[Iter->second];
++SI.Hits;
SI.Hits += Count;
}
void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo,

View File

@ -285,7 +285,7 @@ struct FuncSampleData {
/// Aggregation helper
DenseMap<uint64_t, size_t> Index;
void bumpCount(uint64_t Offset);
void bumpCount(uint64_t Offset, uint64_t Count);
};
//===----------------------------------------------------------------------===//