forked from OSchip/llvm-project
[BOLT] Fix stale functions when using BAT
Summary: If collecting data in Intel Skylake machines, we may face a bug where LBR0 or LBR1 may be duplicated w.r.t. the next entry. This makes perf2bolt interpret it as an invalid trace, which ordinarily we discard during aggregation. However, in BAT, since we do not disassemble the binary where the collection happened but rely only on the translation table, it is not possible to detect bad traces and discard them. This gets to the fdata file, and this invalid trace ends up invalidating the profile for the whole function (by being treated as stale by BOLT). In this patch, we detect Skylake by looking for LBRs with 32 entries, and discard the first 2 entries to avoid running into this problem. It also fixes an issue with collision in the translation map by prioritizing the last basic block when more than one share the same output address. (cherry picked from FBD17996791)
This commit is contained in:
parent
103b0a77cc
commit
b807641e2a
|
@ -33,7 +33,14 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
|
||||||
DEBUG(dbgs() << "BB " << BB.getName() <<"\n");
|
DEBUG(dbgs() << "BB " << BB.getName() <<"\n");
|
||||||
DEBUG(dbgs() << " Key: " << Twine::utohexstr(Key)
|
DEBUG(dbgs() << " Key: " << Twine::utohexstr(Key)
|
||||||
<< " Val: " << Twine::utohexstr(Val) << "\n");
|
<< " Val: " << Twine::utohexstr(Val) << "\n");
|
||||||
Map.insert(std::pair<uint32_t, uint32_t>(Key, Val));
|
// In case of conflicts (same Key mapping to different Vals), the last
|
||||||
|
// update takes precedence. Of course it is not ideal to have conflicts and
|
||||||
|
// those happen when we have an empty BB that either contained only
|
||||||
|
// NOPs or a jump to the next block (successor). Either way, the successor
|
||||||
|
// and this deleted block will both share the same output address (the same key),
|
||||||
|
// and we need to map back. We choose here to privilege the successor by
|
||||||
|
// allowing it to overwrite the previously inserted key in the map.
|
||||||
|
Map[Key] = Val;
|
||||||
|
|
||||||
// Look for special instructions we are interested in mapping offsets. These
|
// Look for special instructions we are interested in mapping offsets. These
|
||||||
// are key instructions for the profile identified by
|
// are key instructions for the profile identified by
|
||||||
|
|
|
@ -1204,6 +1204,7 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||||
uint64_t NumSamples{0};
|
uint64_t NumSamples{0};
|
||||||
uint64_t NumSamplesNoLBR{0};
|
uint64_t NumSamplesNoLBR{0};
|
||||||
uint64_t NumTraces{0};
|
uint64_t NumTraces{0};
|
||||||
|
bool NeedsSkylakeFix{false};
|
||||||
|
|
||||||
while (hasData()) {
|
while (hasData()) {
|
||||||
++NumTotalSamples;
|
++NumTotalSamples;
|
||||||
|
@ -1226,11 +1227,25 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||||
}
|
}
|
||||||
|
|
||||||
NumEntries += Sample.LBR.size();
|
NumEntries += Sample.LBR.size();
|
||||||
|
if (BAT && NumEntries == 32 && !NeedsSkylakeFix) {
|
||||||
|
outs() << "BOLT-WARNING: Using Intel Skylake bug workaround\n";
|
||||||
|
NeedsSkylakeFix = true;
|
||||||
|
}
|
||||||
|
|
||||||
// LBRs are stored in reverse execution order. NextPC refers to the next
|
// LBRs are stored in reverse execution order. NextPC refers to the next
|
||||||
// recorded executed PC.
|
// recorded executed PC.
|
||||||
uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0;
|
uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0;
|
||||||
|
uint32_t NumEntry{0};
|
||||||
for (const auto &LBR : Sample.LBR) {
|
for (const auto &LBR : Sample.LBR) {
|
||||||
|
++NumEntry;
|
||||||
|
// Hardware bug workaround: Intel Skylake (which has 32 LBR entries)
|
||||||
|
// sometimes record entry 32 as an exact copy of entry 31. This will cause
|
||||||
|
// us to likely record an invalid trace and generate a stale function for
|
||||||
|
// BAT mode (non BAT disassembles the function and is able to ignore this
|
||||||
|
// trace at aggregation time). Drop first 2 entries (last two, in
|
||||||
|
// chronological order)
|
||||||
|
if (NeedsSkylakeFix && NumEntry <= 2)
|
||||||
|
continue;
|
||||||
if (NextPC) {
|
if (NextPC) {
|
||||||
// Record fall-through trace.
|
// Record fall-through trace.
|
||||||
const auto TraceFrom = LBR.To;
|
const auto TraceFrom = LBR.To;
|
||||||
|
|
|
@ -53,7 +53,7 @@ class BoltAddressTranslation;
|
||||||
class DataAggregator : public DataReader {
|
class DataAggregator : public DataReader {
|
||||||
|
|
||||||
struct PerfBranchSample {
|
struct PerfBranchSample {
|
||||||
SmallVector<LBREntry, 16> LBR;
|
SmallVector<LBREntry, 32> LBR;
|
||||||
uint64_t PC;
|
uint64_t PC;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue