[PDB] Hash types up front when merging types instead of using StringMap

Summary:
First, StringMap uses llvm::HashString, which is only good for short
identifiers and really bad for large blobs of binary data like type
records. Moving to `DenseMap<StringRef, TypeIndex>` with some tricks for
memory allocation fixes that.

Unfortunately, that didn't buy very much performance. Profiling showed
that we spend a long time during DenseMap growth rehashing existing
entries. Also, in general, DenseMap is faster when the keys are small.
This change takes that to the logical conclusion by introducing a small
wrapper value type around a pointer to key data. The key data contains a
precomputed hash, the original record data (pointer and size), and the
type index, which is the "value" of our original map.

This reduces the time to produce llvm-as.exe and llvm-as.pdb from ~15s
on my machine to 3.5s, which is about a 4x improvement.

Reviewers: zturner, inglorion, ruiu

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D33428

llvm-svn: 303665
This commit is contained in:
Reid Kleckner 2017-05-23 18:23:59 +00:00
parent 15288da293
commit ded38803c5
6 changed files with 140 additions and 95 deletions

View File

@ -17,7 +17,6 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Error.h"
@ -26,6 +25,8 @@ namespace llvm {
namespace codeview {
class TypeHasher;
class TypeSerializer : public TypeVisitorCallbacks {
struct SubRecord {
SubRecord(TypeLeafKind K, uint32_t S) : Kind(K), Size(S) {}
@ -45,14 +46,13 @@ class TypeSerializer : public TypeVisitorCallbacks {
}
};
typedef SmallVector<MutableArrayRef<uint8_t>, 2> RecordList;
typedef SmallVector<MutableArrayRef<uint8_t>, 2> MutableRecordList;
static constexpr uint8_t ContinuationLength = 8;
BumpPtrAllocator &RecordStorage;
RecordSegment CurrentSegment;
RecordList FieldListSegments;
MutableRecordList FieldListSegments;
TypeIndex LastTypeIndex;
Optional<TypeLeafKind> TypeKind;
Optional<TypeLeafKind> MemberKind;
std::vector<uint8_t> RecordBuffer;
@ -60,28 +60,23 @@ class TypeSerializer : public TypeVisitorCallbacks {
BinaryStreamWriter Writer;
TypeRecordMapping Mapping;
RecordList SeenRecords;
StringMap<TypeIndex> HashedRecords;
/// Private type record hashing implementation details are handled here.
std::unique_ptr<TypeHasher> Hasher;
bool isInFieldList() const;
TypeIndex calcNextTypeIndex() const;
TypeIndex incrementTypeIndex();
MutableArrayRef<uint8_t> getCurrentSubRecordData();
MutableArrayRef<uint8_t> getCurrentRecordData();
Error writeRecordPrefix(TypeLeafKind Kind);
TypeIndex insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record);
TypeIndex insertRecordBytesWithCopy(CVType &Record,
MutableArrayRef<uint8_t> Data);
Expected<MutableArrayRef<uint8_t>>
addPadding(MutableArrayRef<uint8_t> Record);
public:
explicit TypeSerializer(BumpPtrAllocator &Storage);
~TypeSerializer();
ArrayRef<MutableArrayRef<uint8_t>> records() const;
TypeIndex getLastTypeIndex() const;
TypeIndex insertRecordBytes(MutableArrayRef<uint8_t> Record);
ArrayRef<ArrayRef<uint8_t>> records() const;
TypeIndex insertRecordBytes(ArrayRef<uint8_t> Record);
Expected<TypeIndex> visitTypeEndGetIndex(CVType &Record);
Error visitTypeBegin(CVType &Record) override;

View File

@ -64,7 +64,7 @@ public:
return *ExpectedIndex;
}
TypeIndex writeSerializedRecord(MutableArrayRef<uint8_t> Record) {
TypeIndex writeSerializedRecord(ArrayRef<uint8_t> Record) {
return Serializer.insertRecordBytes(Record);
}
@ -77,9 +77,7 @@ public:
}
}
ArrayRef<MutableArrayRef<uint8_t>> records() const {
return Serializer.records();
}
ArrayRef<ArrayRef<uint8_t>> records() const { return Serializer.records(); }
};
class FieldListRecordBuilder {

View File

@ -18,7 +18,7 @@ namespace codeview {
class TypeTableCollection : public TypeCollection {
public:
explicit TypeTableCollection(ArrayRef<MutableArrayRef<uint8_t>> Records);
explicit TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records);
Optional<TypeIndex> getFirst() override;
Optional<TypeIndex> getNext(TypeIndex Prev) override;
@ -33,7 +33,7 @@ private:
bool hasCapacityFor(TypeIndex Index) const;
void ensureTypeExists(TypeIndex Index);
ArrayRef<MutableArrayRef<uint8_t>> Records;
ArrayRef<ArrayRef<uint8_t>> Records;
TypeDatabase Database;
};
}

View File

@ -9,6 +9,7 @@
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/BinaryStreamWriter.h"
#include <string.h>
@ -16,23 +17,118 @@
using namespace llvm;
using namespace llvm::codeview;
namespace {
/// A type record bundled with its precomputed hash and destination type
/// index. Instances live in the hasher's KeyStorage; the record bytes they
/// point at live in the longer-lived RecordStorage.
struct HashedType {
  uint64_t Hash;       // Precomputed hash of the record bytes below.
  const uint8_t *Data; // Serialized record bytes (stable storage after insert).
  unsigned Size; // FIXME: Go to uint16_t?
  TypeIndex Index; // Destination type index assigned to this record.
};
/// Wrapper around a pointer to a HashedType. Hash and equality operations are
/// based on data in the pointee, keeping the hash-table entries themselves a
/// single pointer wide so rehashing on growth is cheap.
struct HashedTypePtr {
  HashedTypePtr() = default;
  HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {}
  HashedType *Ptr = nullptr;
};
} // namespace
// DenseMap traits for HashedTypePtr: hashing and equality look through the
// pointer at the pointee's precomputed hash and record bytes.
// NOTE(review): this specializes DenseMapInfo found via `using namespace
// llvm` at global scope — confirm it resolves to llvm::DenseMapInfo as
// intended.
template <> struct DenseMapInfo<HashedTypePtr> {
  static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); }
  static inline HashedTypePtr getTombstoneKey() {
    // Address 1 is never a valid HashedType*, so it is safe as a sentinel.
    return HashedTypePtr(reinterpret_cast<HashedType *>(1));
  }
  static unsigned getHashValue(HashedTypePtr Val) {
    assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr);
    // Returns the stored 64-bit hash truncated to `unsigned`; no rehash of
    // the record bytes is ever needed.
    return Val.Ptr->Hash;
  }
  static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) {
    HashedType *LHS = LHSP.Ptr;
    HashedType *RHS = RHSP.Ptr;
    // Sentinel keys compare equal only to themselves; LHS is never a
    // sentinel when RHS is a real entry.
    if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr)
      return LHS == RHS;
    // Cheap rejects first (hash, then size) before the byte-wise compare.
    if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size)
      return false;
    return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0;
  }
};
/// Private implementation so that we don't leak our DenseMap instantiations to
/// users.
class llvm::codeview::TypeHasher {
private:
  /// Storage for type record provided by the caller. Records will outlive the
  /// hasher object, so they should be allocated here.
  BumpPtrAllocator &RecordStorage;

  /// Storage for hash keys. These only need to live as long as the hashing
  /// operation.
  BumpPtrAllocator KeyStorage;

  /// Hash table. We really want a DenseMap<ArrayRef<uint8_t>, TypeIndex> here,
  /// but DenseMap is inefficient when the keys are long (like type records)
  /// because it recomputes the hash value of every key when it grows. This
  /// value type stores the hash out of line in KeyStorage, so that table
  /// entries are small and easy to rehash.
  DenseSet<HashedTypePtr> HashedRecords;

  /// Deduplicated records in insertion order, viewing stable storage.
  SmallVector<ArrayRef<uint8_t>, 2> SeenRecords;

  /// Index that will be assigned to the next record not already in the table.
  TypeIndex NextTypeIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);

public:
  TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {}

  /// Returns all deduplicated records seen so far, in insertion order.
  ArrayRef<ArrayRef<uint8_t>> records() const { return SeenRecords; }

  /// Takes the bytes of type record, inserts them into the hash table, saves
  /// them, and returns a pointer to an identical stable type record along with
  /// its type index in the destination stream.
  TypeIndex getOrCreateRecord(ArrayRef<uint8_t> &Record);
};
/// Deduplicating insert: hashes \p Record, looks it up, and on a miss copies
/// the bytes to stable storage and assigns the next type index. On return,
/// \p Record is rewritten to view the stable copy (new or preexisting).
TypeIndex TypeHasher::getOrCreateRecord(ArrayRef<uint8_t> &Record) {
  assert(Record.size() < UINT32_MAX && "Record too big");
  assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");

  // Compute the hash up front so we can store it in the key.
  HashedType Candidate = {hash_value(Record), Record.data(),
                          unsigned(Record.size()), NextTypeIndex};

  auto InsertResult = HashedRecords.insert(HashedTypePtr(&Candidate));
  HashedType *&Entry = InsertResult.first->Ptr;

  if (InsertResult.second) {
    // New type record: both the key and the record bytes need stable storage.
    // The record must outlive the hashing operation, so it goes in the
    // caller-provided allocator; the key only needs to outlive the table.
    Entry = KeyStorage.Allocate<HashedType>();
    *Entry = Candidate;

    uint8_t *StableBytes = RecordStorage.Allocate<uint8_t>(Record.size());
    memcpy(StableBytes, Record.data(), Record.size());
    Entry->Data = StableBytes;
    assert(Entry->Size == Record.size());

    // A fresh record consumes the index we reserved in the candidate key.
    ++NextTypeIndex;

    // FIXME: Can we record these in a more efficient way?
    SeenRecords.push_back(ArrayRef<uint8_t>(StableBytes, Record.size()));
  }

  // Update the caller's copy of Record to point a stable copy.
  Record = ArrayRef<uint8_t>(Entry->Data, Entry->Size);
  return TypeIndex(Entry->Index);
}
// True while the serializer has an open LF_FIELDLIST record.
bool TypeSerializer::isInFieldList() const {
  if (!TypeKind.hasValue())
    return false;
  return *TypeKind == TypeLeafKind::LF_FIELDLIST;
}
// Computes (without committing) the index the next record would receive:
// the first non-simple index when nothing has been emitted yet, otherwise
// one past the last assigned index.
TypeIndex TypeSerializer::calcNextTypeIndex() const {
  return LastTypeIndex.isNoneType()
             ? TypeIndex(TypeIndex::FirstNonSimpleIndex)
             : TypeIndex(LastTypeIndex.getIndex() + 1);
}
// Advances LastTypeIndex to the next index and returns its value from
// before the increment (post-increment semantics).
TypeIndex TypeSerializer::incrementTypeIndex() {
  TypeIndex Prev = LastTypeIndex;
  LastTypeIndex = calcNextTypeIndex();
  return Prev;
}
MutableArrayRef<uint8_t> TypeSerializer::getCurrentSubRecordData() {
assert(isInFieldList());
return getCurrentRecordData().drop_front(CurrentSegment.length());
@ -51,46 +147,6 @@ Error TypeSerializer::writeRecordPrefix(TypeLeafKind Kind) {
return Error::success();
}
// Deduplicating insert of a serialized record. The raw bytes are used as a
// StringMap key; on a miss the record is remembered in SeenRecords and
// assigned the next type index, on a hit the previously assigned index is
// returned. Note: does not copy the bytes — the caller's storage must be
// stable.
TypeIndex
TypeSerializer::insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record) {
  assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");

  // View the record bytes as a StringRef so they can key the map.
  StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());

  TypeIndex NextTypeIndex = calcNextTypeIndex();
  auto Result = HashedRecords.try_emplace(S, NextTypeIndex);
  if (Result.second) {
    // New record: commit the index and remember the bytes.
    LastTypeIndex = NextTypeIndex;
    SeenRecords.push_back(Record);
  }
  // On a duplicate this is the index of the earlier identical record.
  return Result.first->getValue();
}
// Like insertRecordBytesPrivate, but on a miss copies the record bytes into
// RecordStorage so they remain valid after the caller's buffer is reused.
// Updates Record.RecordData to view the stable copy.
TypeIndex
TypeSerializer::insertRecordBytesWithCopy(CVType &Record,
                                          MutableArrayRef<uint8_t> Data) {
  assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!");

  StringRef S(reinterpret_cast<const char *>(Data.data()), Data.size());

  // Do a two-stage lookup / insert so that we don't have to allocate unless
  // we're going to do an insert. This is a big memory savings.
  auto Iter = HashedRecords.find(S);
  if (Iter != HashedRecords.end())
    return Iter->second;

  LastTypeIndex = calcNextTypeIndex();

  // Copy the bytes into stable storage, then re-key on the stable copy.
  uint8_t *Copy = RecordStorage.Allocate<uint8_t>(Data.size());
  ::memcpy(Copy, Data.data(), Data.size());
  Data = MutableArrayRef<uint8_t>(Copy, Data.size());
  S = StringRef(reinterpret_cast<const char *>(Data.data()), Data.size());
  HashedRecords.insert(std::make_pair(S, LastTypeIndex));
  SeenRecords.push_back(Data);
  Record.RecordData = Data;
  return LastTypeIndex;
}
Expected<MutableArrayRef<uint8_t>>
TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
uint32_t Align = Record.size() % 4;
@ -109,26 +165,25 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
}
TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
: RecordStorage(Storage), LastTypeIndex(),
RecordBuffer(MaxRecordLength * 2),
: RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2),
Stream(RecordBuffer, llvm::support::little), Writer(Stream),
Mapping(Writer) {
Mapping(Writer), Hasher(make_unique<TypeHasher>(Storage)) {
// RecordBuffer needs to be able to hold enough data so that if we are 1
// byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
// we won't overflow.
}
ArrayRef<MutableArrayRef<uint8_t>> TypeSerializer::records() const {
return SeenRecords;
TypeSerializer::~TypeSerializer() = default;
ArrayRef<ArrayRef<uint8_t>> TypeSerializer::records() const {
return Hasher->records();
}
TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; }
TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef<uint8_t> Record) {
TypeIndex TypeSerializer::insertRecordBytes(ArrayRef<uint8_t> Record) {
assert(!TypeKind.hasValue() && "Already in a type mapping!");
assert(Writer.getOffset() == 0 && "Stream has data already!");
return insertRecordBytesPrivate(Record);
return Hasher->getOrCreateRecord(Record);
}
Error TypeSerializer::visitTypeBegin(CVType &Record) {
@ -163,8 +218,8 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t);
Record.Type = *TypeKind;
TypeIndex InsertedTypeIndex =
insertRecordBytesWithCopy(Record, ThisRecordData);
Record.RecordData = ThisRecordData;
TypeIndex InsertedTypeIndex = Hasher->getOrCreateRecord(Record.RecordData);
// Write out each additional segment in reverse order, and update each
// record's continuation index to point to the previous one.
@ -174,7 +229,7 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
reinterpret_cast<support::ulittle32_t *>(CIBytes.data());
assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder");
*CI = InsertedTypeIndex.getIndex();
InsertedTypeIndex = insertRecordBytesPrivate(X);
InsertedTypeIndex = Hasher->getOrCreateRecord(X);
}
TypeKind.reset();

View File

@ -24,8 +24,7 @@ static void error(Error &&EC) {
consumeError(std::move(EC));
}
TypeTableCollection::TypeTableCollection(
ArrayRef<MutableArrayRef<uint8_t>> Records)
TypeTableCollection::TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records)
: Records(Records), Database(Records.size()) {}
Optional<TypeIndex> TypeTableCollection::getFirst() {

View File

@ -877,14 +877,12 @@ static void mergePdbs() {
auto &DestTpi = Builder.getTpiBuilder();
auto &DestIpi = Builder.getIpiBuilder();
MergedTpi.ForEachRecord(
[&DestTpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
DestTpi.addTypeRecord(Data, None);
});
MergedIpi.ForEachRecord(
[&DestIpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
DestIpi.addTypeRecord(Data, None);
});
MergedTpi.ForEachRecord([&DestTpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
DestTpi.addTypeRecord(Data, None);
});
MergedIpi.ForEachRecord([&DestIpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
DestIpi.addTypeRecord(Data, None);
});
SmallString<64> OutFile(opts::merge::PdbOutputFile);
if (OutFile.empty()) {