[BOLT] Hash anonymous symbol names

Summary:
This diff replaces the addresses in all the {SYMBOLat,HOLEat,DATAat} symbols with hash values based on the data contained in the symbol.  It should make the profiling data for anonymous symbols robust to address changes.

The only small problem with this approach is that the hashed name for padding symbols of the same size collide frequently.  This shouldn't be a big deal since it would be weird if those symbols were hot.

On a test run with hhvm there were 26 collisions (out of ~338k symbols).  Most of the collisions were from small (2,4,8 byte) objects.

(cherry picked from FBD7134261)
This commit is contained in:
Bill Nell 2018-06-06 03:17:32 -07:00 committed by Maksim Panchenko
parent 779541283a
commit 706abb6c95
9 changed files with 143 additions and 18 deletions

View File

@ -123,14 +123,14 @@ void BinaryContext::updateObjectNesting(BinaryDataMapType::iterator GAI) {
auto fixParents =
[&](BinaryDataMapType::iterator Itr, BinaryData *NewParent) {
auto *OldParent = Itr->second->Parent;
auto *OldParent = Itr->second->Parent;
Itr->second->Parent = NewParent;
++Itr;
while (Itr != BinaryDataMap.end() && OldParent &&
Itr->second->Parent == OldParent) {
Itr->second->Parent = NewParent;
++Itr;
while (Itr != BinaryDataMap.end() && OldParent &&
Itr->second->Parent == OldParent) {
Itr->second->Parent = NewParent;
++Itr;
}
}
};
// Check if the previous symbol contains the newly added symbol.
@ -225,11 +225,13 @@ MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name,
GlobalSymbols[Name] = BD;
}
updateObjectNesting(GAI);
BD = nullptr;
} else if (!GAI->second->hasName(Name)) {
GAI->second->Names.push_back(Name);
GlobalSymbols[Name] = GAI->second;
} else {
BD = nullptr;
}
BD = nullptr;
} else {
GAI = BinaryDataMap.emplace(Address, BD).first;
GlobalSymbols[Name] = BD;
@ -240,7 +242,8 @@ MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name,
auto *Symbol = Ctx->getOrCreateSymbol(Name);
if (BD) {
BD->Symbols.push_back(Symbol);
assert(BD->Symbols.size() == BD->Names.size());
assert(BD->Symbols.size() == BD->Names.size() &&
"there should be a 1:1 mapping between names and symbols");
}
return Symbol;
}
@ -298,6 +301,69 @@ bool BinaryContext::setBinaryDataSize(uint64_t Address, uint64_t Size) {
return false;
}
void BinaryContext::generateSymbolHashes() {
auto isNonAnonymousName = [](StringRef Name) {
return !(Name.startswith("SYMBOLat") ||
Name.startswith("DATAat") ||
Name.startswith("HOLEat"));
};
auto isPadding = [](const BinaryData &BD) {
auto Contents = BD.getSection().getContents();
auto SymData = Contents.substr(BD.getOffset(), BD.getSize());
return (BD.getName().startswith("HOLEat") ||
SymData.find_first_not_of(0) == StringRef::npos);
};
for (auto &Entry : BinaryDataMap) {
auto &BD = *Entry.second;
auto Name = BD.getName();
if (isNonAnonymousName(Name))
continue;
// First check if a non-anonymous alias exists and move it to the front.
if (BD.getNames().size() > 1) {
auto Itr = std::find_if(BD.Names.begin(),
BD.Names.end(),
isNonAnonymousName);
if (Itr != BD.Names.end()) {
assert(BD.Names.size() == BD.Symbols.size() &&
"there should be a 1:1 mapping between names and symbols");
auto Idx = std::distance(BD.Names.begin(), Itr);
std::swap(BD.Names[0], *Itr);
std::swap(BD.Symbols[0], BD.Symbols[Idx]);
continue;
}
}
// We have to skip 0 size symbols since they will all collide.
if (BD.getSize() == 0) {
continue;
}
const auto Hash = BD.getSection().hash(BD);
const auto Idx = Name.find("0x");
std::string NewName = (Twine(Name.substr(0, Idx)) +
"_" + Twine::utohexstr(Hash)).str();
if (getBinaryDataByName(NewName)) {
// Ignore collisions for symbols that appear to be padding
// (i.e. all zeros or a "hole")
if (!isPadding(BD)) {
outs() << "BOLT-WARNING: collision detected when hashing " << BD
<< " with new name (" << NewName << "), skipping.\n";
}
continue;
}
BD.Names.insert(BD.Names.begin(), NewName);
BD.Symbols.insert(BD.Symbols.begin(),
Ctx->getOrCreateSymbol(NewName));
assert(BD.Names.size() == BD.Symbols.size() &&
"there should be a 1:1 mapping between names and symbols");
GlobalSymbols[NewName] = &BD;
}
}
void BinaryContext::postProcessSymbolTable() {
fixBinaryDataHoles();
bool Valid = true;
@ -315,6 +381,7 @@ void BinaryContext::postProcessSymbolTable() {
}
assert(Valid);
assignMemData();
generateSymbolHashes();
}
void BinaryContext::foldFunction(BinaryFunction &ChildBF,
@ -379,7 +446,7 @@ void BinaryContext::fixBinaryDataHoles() {
while (Itr != End) {
if (Itr->second->getAddress() > EndAddress) {
auto Gap = Itr->second->getAddress() - EndAddress;
auto Gap = Itr->second->getAddress() - EndAddress;
Holes.push_back(std::make_pair(EndAddress, Gap));
}
EndAddress = Itr->second->getEndAddress();
@ -723,11 +790,11 @@ void BinaryContext::printInstruction(raw_ostream &OS,
OS << " # TAILCALL ";
if (MIB->isInvoke(Instruction)) {
if (const auto EHInfo = MIB->getEHInfo(Instruction)) {
OS << " # handler: ";
OS << " # handler: ";
if (EHInfo->first)
OS << *EHInfo->first;
else
OS << '0';
else
OS << '0';
OS << "; action: " << EHInfo->second;
}
auto GnuArgsSize = MIB->getGnuArgsSize(Instruction);

View File

@ -192,6 +192,9 @@ public:
/// symbols are padded with the space before the next BinaryData object.
void fixBinaryDataHoles();
/// Generate names based on data hashes for unknown symbols.
void generateSymbolHashes();
/// Populate \p GlobalMemData. This should be done after all symbol discovery
/// is complete, e.g. after building CFGs for all functions.
void assignMemData();

View File

@ -32,6 +32,10 @@ PrintSymbolAliases("print-aliases",
cl::cat(BoltCategory));
}
bool BinaryData::isAbsolute() const {
return Flags & SymbolRef::SF_Absolute;
}
bool BinaryData::isMoveable() const {
return (!isAbsolute() &&
(IsMoveable &&

View File

@ -23,7 +23,7 @@
namespace llvm {
namespace bolt {
struct BinarySection;
class BinarySection;
/// \p BinaryData represents an indivisible part of a data section section.
/// BinaryData's may contain sub-components, e.g. jump tables but they are
@ -106,7 +106,7 @@ public:
bool isAtomic() const {
return isTopLevelJumpTable() || !Parent;
}
iterator_range<std::vector<std::string>::const_iterator> names() const {
return make_range(Names.begin(), Names.end());
}
@ -140,7 +140,7 @@ public:
return std::find(Symbols.begin(), Symbols.end(), Symbol) != Symbols.end();
}
bool isAbsolute() const { return getSymbol()->isAbsolute(); }
bool isAbsolute() const;
bool isMoveable() const;
uint64_t getAddress() const { return Address; }

View File

@ -23,6 +23,44 @@ namespace opts {
extern cl::opt<bool> PrintRelocations;
}
uint64_t
BinarySection::hash(const BinaryData &BD,
std::map<const BinaryData *, uint64_t> &Cache) const {
auto Itr = Cache.find(&BD);
if (Itr != Cache.end())
return Itr->second;
Cache[&BD] = 0;
auto Offset = BD.getAddress() - getAddress();
const auto EndOffset = BD.getEndAddress() - getAddress();
auto Begin = Relocations.lower_bound(Relocation{Offset, 0, 0, 0, 0});
auto End = Relocations.upper_bound(Relocation{EndOffset, 0, 0, 0, 0});
const auto Contents = getContents();
hash_code Hash = hash_combine(hash_value(BD.getSize()),
hash_value(BD.getSectionName()));
while (Begin != End) {
const auto &Rel = *Begin++;
Hash = hash_combine(
Hash,
hash_value(Contents.substr(Offset, Begin->Offset - Offset)));
if (auto *RelBD = BC.getBinaryDataByName(Rel.Symbol->getName())) {
Hash = hash_combine(Hash, hash(*RelBD, Cache));
}
Offset = Rel.Offset + Rel.getSize();
}
Hash = hash_combine(
Hash,
hash_value(Contents.substr(Offset, EndOffset - Offset)));
Cache[&BD] = Hash;
return Hash;
}
BinarySection::~BinarySection() {
if (isReordered()) {
delete[] getData();

View File

@ -22,6 +22,7 @@
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/raw_ostream.h"
#include <set>
#include <map>
namespace llvm {
@ -69,6 +70,9 @@ class BinarySection {
// Set by ExecutableFileMemoryManager.
mutable bool IsReordered{false}; // Have the contents been reordered?
uint64_t hash(const BinaryData &BD,
std::map<const BinaryData *, uint64_t> &Cache) const;
// non-copyable
BinarySection(const BinarySection &) = delete;
BinarySection(BinarySection &&) = delete;
@ -340,6 +344,11 @@ public:
return Itr != Relocations.end() ? &*Itr : nullptr;
}
uint64_t hash(const BinaryData &BD) const {
std::map<const BinaryData *, uint64_t> Cache;
return hash(BD, Cache);
}
///
/// Property accessors related to output data.
///

View File

@ -44,6 +44,9 @@ struct Relocation {
/// Return size of the given relocation \p Type.
static size_t getSizeForType(uint64_t Type);
/// Return size of this relocation.
size_t getSize() const { return getSizeForType(Type); }
/// Extract current relocated value from binary contents. This is used for
/// RISC architectures where values are encoded in specific bits depending
/// on the relocation value.

View File

@ -2573,6 +2573,8 @@ void RewriteInstance::disassembleFunctions() {
Function.print(outs(), "while building cfg", true);
} // Iterate over all functions
BC->postProcessSymbolTable();
}
void RewriteInstance::postProcessFunctions() {
@ -2601,8 +2603,6 @@ void RewriteInstance::postProcessFunctions() {
BC->SumExecutionCount += Function.getKnownExecutionCount();
}
BC->postProcessSymbolTable();
if (opts::PrintGlobals) {
outs() << "BOLT-INFO: Global symbols:\n";
BC->printGlobalSymbols(outs());

View File

@ -306,7 +306,8 @@ int main(int argc, char **argv) {
// For consistency, sort functions by their IDs.
std::sort(MergedProfile.Functions.begin(), MergedProfile.Functions.end(),
[] (BinaryFunctionProfile &A, BinaryFunctionProfile &B) {
[] (const BinaryFunctionProfile &A,
const BinaryFunctionProfile &B) {
return A.Id < B.Id;
});