forked from OSchip/llvm-project
[PDB] Hash types up front when merging types instead of using StringMap
Summary: First, StringMap uses llvm::HashString, which is only good for short identifiers and really bad for large blobs of binary data like type records. Moving to `DenseMap<StringRef, TypeIndex>` with some tricks for memory allocation fixes that. Unfortunately, that didn't buy very much performance. Profiling showed that we spend a long time during DenseMap growth rehashing existing entries. Also, in general, DenseMap is faster when the keys are small. This change takes that to the logical conclusion by introducing a small wrapper value type around a pointer to key data. The key data contains a precomputed hash, the original record data (pointer and size), and the type index, which is the "value" of our original map. This reduces the time to produce llvm-as.exe and llvm-as.pdb from ~15s on my machine to 3.5s, which is about a 4x improvement. Reviewers: zturner, inglorion, ruiu Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D33428 llvm-svn: 303665
This commit is contained in:
parent
15288da293
commit
ded38803c5
|
@ -17,7 +17,6 @@
|
|||
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringMap.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Support/Allocator.h"
|
||||
#include "llvm/Support/Error.h"
|
||||
|
@ -26,6 +25,8 @@ namespace llvm {
|
|||
|
||||
namespace codeview {
|
||||
|
||||
class TypeHasher;
|
||||
|
||||
class TypeSerializer : public TypeVisitorCallbacks {
|
||||
struct SubRecord {
|
||||
SubRecord(TypeLeafKind K, uint32_t S) : Kind(K), Size(S) {}
|
||||
|
@ -45,14 +46,13 @@ class TypeSerializer : public TypeVisitorCallbacks {
|
|||
}
|
||||
};
|
||||
|
||||
typedef SmallVector<MutableArrayRef<uint8_t>, 2> RecordList;
|
||||
typedef SmallVector<MutableArrayRef<uint8_t>, 2> MutableRecordList;
|
||||
|
||||
static constexpr uint8_t ContinuationLength = 8;
|
||||
BumpPtrAllocator &RecordStorage;
|
||||
RecordSegment CurrentSegment;
|
||||
RecordList FieldListSegments;
|
||||
MutableRecordList FieldListSegments;
|
||||
|
||||
TypeIndex LastTypeIndex;
|
||||
Optional<TypeLeafKind> TypeKind;
|
||||
Optional<TypeLeafKind> MemberKind;
|
||||
std::vector<uint8_t> RecordBuffer;
|
||||
|
@ -60,28 +60,23 @@ class TypeSerializer : public TypeVisitorCallbacks {
|
|||
BinaryStreamWriter Writer;
|
||||
TypeRecordMapping Mapping;
|
||||
|
||||
RecordList SeenRecords;
|
||||
StringMap<TypeIndex> HashedRecords;
|
||||
/// Private type record hashing implementation details are handled here.
|
||||
std::unique_ptr<TypeHasher> Hasher;
|
||||
|
||||
bool isInFieldList() const;
|
||||
TypeIndex calcNextTypeIndex() const;
|
||||
TypeIndex incrementTypeIndex();
|
||||
MutableArrayRef<uint8_t> getCurrentSubRecordData();
|
||||
MutableArrayRef<uint8_t> getCurrentRecordData();
|
||||
Error writeRecordPrefix(TypeLeafKind Kind);
|
||||
TypeIndex insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record);
|
||||
TypeIndex insertRecordBytesWithCopy(CVType &Record,
|
||||
MutableArrayRef<uint8_t> Data);
|
||||
|
||||
Expected<MutableArrayRef<uint8_t>>
|
||||
addPadding(MutableArrayRef<uint8_t> Record);
|
||||
|
||||
public:
|
||||
explicit TypeSerializer(BumpPtrAllocator &Storage);
|
||||
~TypeSerializer();
|
||||
|
||||
ArrayRef<MutableArrayRef<uint8_t>> records() const;
|
||||
TypeIndex getLastTypeIndex() const;
|
||||
TypeIndex insertRecordBytes(MutableArrayRef<uint8_t> Record);
|
||||
ArrayRef<ArrayRef<uint8_t>> records() const;
|
||||
TypeIndex insertRecordBytes(ArrayRef<uint8_t> Record);
|
||||
Expected<TypeIndex> visitTypeEndGetIndex(CVType &Record);
|
||||
|
||||
Error visitTypeBegin(CVType &Record) override;
|
||||
|
|
|
@ -64,7 +64,7 @@ public:
|
|||
return *ExpectedIndex;
|
||||
}
|
||||
|
||||
TypeIndex writeSerializedRecord(MutableArrayRef<uint8_t> Record) {
|
||||
TypeIndex writeSerializedRecord(ArrayRef<uint8_t> Record) {
|
||||
return Serializer.insertRecordBytes(Record);
|
||||
}
|
||||
|
||||
|
@ -77,9 +77,7 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
ArrayRef<MutableArrayRef<uint8_t>> records() const {
|
||||
return Serializer.records();
|
||||
}
|
||||
ArrayRef<ArrayRef<uint8_t>> records() const { return Serializer.records(); }
|
||||
};
|
||||
|
||||
class FieldListRecordBuilder {
|
||||
|
|
|
@ -18,7 +18,7 @@ namespace codeview {
|
|||
|
||||
class TypeTableCollection : public TypeCollection {
|
||||
public:
|
||||
explicit TypeTableCollection(ArrayRef<MutableArrayRef<uint8_t>> Records);
|
||||
explicit TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records);
|
||||
|
||||
Optional<TypeIndex> getFirst() override;
|
||||
Optional<TypeIndex> getNext(TypeIndex Prev) override;
|
||||
|
@ -33,7 +33,7 @@ private:
|
|||
bool hasCapacityFor(TypeIndex Index) const;
|
||||
void ensureTypeExists(TypeIndex Index);
|
||||
|
||||
ArrayRef<MutableArrayRef<uint8_t>> Records;
|
||||
ArrayRef<ArrayRef<uint8_t>> Records;
|
||||
TypeDatabase Database;
|
||||
};
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
|
||||
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/Support/BinaryStreamWriter.h"
|
||||
|
||||
#include <string.h>
|
||||
|
@ -16,23 +17,118 @@
|
|||
using namespace llvm;
|
||||
using namespace llvm::codeview;
|
||||
|
||||
namespace {
|
||||
struct HashedType {
|
||||
uint64_t Hash;
|
||||
const uint8_t *Data;
|
||||
unsigned Size; // FIXME: Go to uint16_t?
|
||||
TypeIndex Index;
|
||||
};
|
||||
|
||||
/// Wrapper around a poitner to a HashedType. Hash and equality operations are
|
||||
/// based on data in the pointee.
|
||||
struct HashedTypePtr {
|
||||
HashedTypePtr() = default;
|
||||
HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {}
|
||||
HashedType *Ptr = nullptr;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <> struct DenseMapInfo<HashedTypePtr> {
|
||||
static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); }
|
||||
static inline HashedTypePtr getTombstoneKey() {
|
||||
return HashedTypePtr(reinterpret_cast<HashedType *>(1));
|
||||
}
|
||||
static unsigned getHashValue(HashedTypePtr Val) {
|
||||
assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr);
|
||||
return Val.Ptr->Hash;
|
||||
}
|
||||
static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) {
|
||||
HashedType *LHS = LHSP.Ptr;
|
||||
HashedType *RHS = RHSP.Ptr;
|
||||
if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr)
|
||||
return LHS == RHS;
|
||||
if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size)
|
||||
return false;
|
||||
return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0;
|
||||
}
|
||||
};
|
||||
|
||||
/// Private implementation so that we don't leak our DenseMap instantiations to
|
||||
/// users.
|
||||
class llvm::codeview::TypeHasher {
|
||||
private:
|
||||
/// Storage for type record provided by the caller. Records will outlive the
|
||||
/// hasher object, so they should be allocated here.
|
||||
BumpPtrAllocator &RecordStorage;
|
||||
|
||||
/// Storage for hash keys. These only need to live as long as the hashing
|
||||
/// operation.
|
||||
BumpPtrAllocator KeyStorage;
|
||||
|
||||
/// Hash table. We really want a DenseMap<ArrayRef<uint8_t>, TypeIndex> here,
|
||||
/// but DenseMap is inefficient when the keys are long (like type records)
|
||||
/// because it recomputes the hash value of every key when it grows. This
|
||||
/// value type stores the hash out of line in KeyStorage, so that table
|
||||
/// entries are small and easy to rehash.
|
||||
DenseSet<HashedTypePtr> HashedRecords;
|
||||
|
||||
SmallVector<ArrayRef<uint8_t>, 2> SeenRecords;
|
||||
|
||||
TypeIndex NextTypeIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);
|
||||
|
||||
public:
|
||||
TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {}
|
||||
|
||||
ArrayRef<ArrayRef<uint8_t>> records() const { return SeenRecords; }
|
||||
|
||||
/// Takes the bytes of type record, inserts them into the hash table, saves
|
||||
/// them, and returns a pointer to an identical stable type record along with
|
||||
/// its type index in the destination stream.
|
||||
TypeIndex getOrCreateRecord(ArrayRef<uint8_t> &Record);
|
||||
};
|
||||
|
||||
TypeIndex TypeHasher::getOrCreateRecord(ArrayRef<uint8_t> &Record) {
|
||||
assert(Record.size() < UINT32_MAX && "Record too big");
|
||||
assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
|
||||
|
||||
// Compute the hash up front so we can store it in the key.
|
||||
HashedType TempHashedType = {hash_value(Record), Record.data(),
|
||||
unsigned(Record.size()), NextTypeIndex};
|
||||
|
||||
auto Result = HashedRecords.insert(HashedTypePtr(&TempHashedType));
|
||||
HashedType *&Hashed = Result.first->Ptr;
|
||||
|
||||
if (Result.second) {
|
||||
// This was a new type record. We need stable storage for both the key and
|
||||
// the record. The record should outlive the hashing operation.
|
||||
Hashed = KeyStorage.Allocate<HashedType>();
|
||||
*Hashed = TempHashedType;
|
||||
|
||||
uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
|
||||
memcpy(Stable, Record.data(), Record.size());
|
||||
Hashed->Data = Stable;
|
||||
assert(Hashed->Size == Record.size());
|
||||
|
||||
// This was a new record, so increment our next type index.
|
||||
++NextTypeIndex;
|
||||
}
|
||||
|
||||
// Update the caller's copy of Record to point a stable copy.
|
||||
Record = ArrayRef<uint8_t>(Hashed->Data, Hashed->Size);
|
||||
|
||||
if (Result.second) {
|
||||
// FIXME: Can we record these in a more efficient way?
|
||||
SeenRecords.push_back(Record);
|
||||
}
|
||||
|
||||
return TypeIndex(Hashed->Index);
|
||||
}
|
||||
|
||||
bool TypeSerializer::isInFieldList() const {
|
||||
return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
|
||||
}
|
||||
|
||||
TypeIndex TypeSerializer::calcNextTypeIndex() const {
|
||||
if (LastTypeIndex.isNoneType())
|
||||
return TypeIndex(TypeIndex::FirstNonSimpleIndex);
|
||||
else
|
||||
return TypeIndex(LastTypeIndex.getIndex() + 1);
|
||||
}
|
||||
|
||||
TypeIndex TypeSerializer::incrementTypeIndex() {
|
||||
TypeIndex Previous = LastTypeIndex;
|
||||
LastTypeIndex = calcNextTypeIndex();
|
||||
return Previous;
|
||||
}
|
||||
|
||||
MutableArrayRef<uint8_t> TypeSerializer::getCurrentSubRecordData() {
|
||||
assert(isInFieldList());
|
||||
return getCurrentRecordData().drop_front(CurrentSegment.length());
|
||||
|
@ -51,46 +147,6 @@ Error TypeSerializer::writeRecordPrefix(TypeLeafKind Kind) {
|
|||
return Error::success();
|
||||
}
|
||||
|
||||
TypeIndex
|
||||
TypeSerializer::insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record) {
|
||||
assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
|
||||
|
||||
StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());
|
||||
|
||||
TypeIndex NextTypeIndex = calcNextTypeIndex();
|
||||
auto Result = HashedRecords.try_emplace(S, NextTypeIndex);
|
||||
if (Result.second) {
|
||||
LastTypeIndex = NextTypeIndex;
|
||||
SeenRecords.push_back(Record);
|
||||
}
|
||||
return Result.first->getValue();
|
||||
}
|
||||
|
||||
TypeIndex
|
||||
TypeSerializer::insertRecordBytesWithCopy(CVType &Record,
|
||||
MutableArrayRef<uint8_t> Data) {
|
||||
assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
|
||||
|
||||
StringRef S(reinterpret_cast<const char *>(Data.data()), Data.size());
|
||||
|
||||
// Do a two state lookup / insert so that we don't have to allocate unless
|
||||
// we're going
|
||||
// to do an insert. This is a big memory savings.
|
||||
auto Iter = HashedRecords.find(S);
|
||||
if (Iter != HashedRecords.end())
|
||||
return Iter->second;
|
||||
|
||||
LastTypeIndex = calcNextTypeIndex();
|
||||
uint8_t *Copy = RecordStorage.Allocate<uint8_t>(Data.size());
|
||||
::memcpy(Copy, Data.data(), Data.size());
|
||||
Data = MutableArrayRef<uint8_t>(Copy, Data.size());
|
||||
S = StringRef(reinterpret_cast<const char *>(Data.data()), Data.size());
|
||||
HashedRecords.insert(std::make_pair(S, LastTypeIndex));
|
||||
SeenRecords.push_back(Data);
|
||||
Record.RecordData = Data;
|
||||
return LastTypeIndex;
|
||||
}
|
||||
|
||||
Expected<MutableArrayRef<uint8_t>>
|
||||
TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
|
||||
uint32_t Align = Record.size() % 4;
|
||||
|
@ -109,26 +165,25 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
|
|||
}
|
||||
|
||||
TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
|
||||
: RecordStorage(Storage), LastTypeIndex(),
|
||||
RecordBuffer(MaxRecordLength * 2),
|
||||
: RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2),
|
||||
Stream(RecordBuffer, llvm::support::little), Writer(Stream),
|
||||
Mapping(Writer) {
|
||||
Mapping(Writer), Hasher(make_unique<TypeHasher>(Storage)) {
|
||||
// RecordBuffer needs to be able to hold enough data so that if we are 1
|
||||
// byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
|
||||
// we won't overflow.
|
||||
}
|
||||
|
||||
ArrayRef<MutableArrayRef<uint8_t>> TypeSerializer::records() const {
|
||||
return SeenRecords;
|
||||
TypeSerializer::~TypeSerializer() = default;
|
||||
|
||||
ArrayRef<ArrayRef<uint8_t>> TypeSerializer::records() const {
|
||||
return Hasher->records();
|
||||
}
|
||||
|
||||
TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; }
|
||||
|
||||
TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef<uint8_t> Record) {
|
||||
TypeIndex TypeSerializer::insertRecordBytes(ArrayRef<uint8_t> Record) {
|
||||
assert(!TypeKind.hasValue() && "Already in a type mapping!");
|
||||
assert(Writer.getOffset() == 0 && "Stream has data already!");
|
||||
|
||||
return insertRecordBytesPrivate(Record);
|
||||
return Hasher->getOrCreateRecord(Record);
|
||||
}
|
||||
|
||||
Error TypeSerializer::visitTypeBegin(CVType &Record) {
|
||||
|
@ -163,8 +218,8 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
|
|||
Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t);
|
||||
|
||||
Record.Type = *TypeKind;
|
||||
TypeIndex InsertedTypeIndex =
|
||||
insertRecordBytesWithCopy(Record, ThisRecordData);
|
||||
Record.RecordData = ThisRecordData;
|
||||
TypeIndex InsertedTypeIndex = Hasher->getOrCreateRecord(Record.RecordData);
|
||||
|
||||
// Write out each additional segment in reverse order, and update each
|
||||
// record's continuation index to point to the previous one.
|
||||
|
@ -174,7 +229,7 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
|
|||
reinterpret_cast<support::ulittle32_t *>(CIBytes.data());
|
||||
assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder");
|
||||
*CI = InsertedTypeIndex.getIndex();
|
||||
InsertedTypeIndex = insertRecordBytesPrivate(X);
|
||||
InsertedTypeIndex = Hasher->getOrCreateRecord(X);
|
||||
}
|
||||
|
||||
TypeKind.reset();
|
||||
|
|
|
@ -24,8 +24,7 @@ static void error(Error &&EC) {
|
|||
consumeError(std::move(EC));
|
||||
}
|
||||
|
||||
TypeTableCollection::TypeTableCollection(
|
||||
ArrayRef<MutableArrayRef<uint8_t>> Records)
|
||||
TypeTableCollection::TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records)
|
||||
: Records(Records), Database(Records.size()) {}
|
||||
|
||||
Optional<TypeIndex> TypeTableCollection::getFirst() {
|
||||
|
|
|
@ -877,14 +877,12 @@ static void mergePdbs() {
|
|||
|
||||
auto &DestTpi = Builder.getTpiBuilder();
|
||||
auto &DestIpi = Builder.getIpiBuilder();
|
||||
MergedTpi.ForEachRecord(
|
||||
[&DestTpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
|
||||
DestTpi.addTypeRecord(Data, None);
|
||||
});
|
||||
MergedIpi.ForEachRecord(
|
||||
[&DestIpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
|
||||
DestIpi.addTypeRecord(Data, None);
|
||||
});
|
||||
MergedTpi.ForEachRecord([&DestTpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
|
||||
DestTpi.addTypeRecord(Data, None);
|
||||
});
|
||||
MergedIpi.ForEachRecord([&DestIpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
|
||||
DestIpi.addTypeRecord(Data, None);
|
||||
});
|
||||
|
||||
SmallString<64> OutFile(opts::merge::PdbOutputFile);
|
||||
if (OutFile.empty()) {
|
||||
|
|
Loading…
Reference in New Issue