[CodeView] Add support for content hashing CodeView type records.

Currently nothing uses this, but this at least gets the core algorithm in, and adds some test to demonstrate correctness. Differential Revision: https://reviews.llvm.org/D40736 llvm-svn: 319854
2017-12-05 23:08:58 +00:00 · 2017-12-05 23:08:58 +00:00 · 87b78e9d1e
parent 0328d0083e
commit 87b78e9d1e
8 changed files with 366 additions and 46 deletions
--- a/llvm/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h
@ -16,6 +16,7 @@
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SimpleTypeSerializer.h"
 #include "llvm/DebugInfo/CodeView/TypeCollection.h"
+#include "llvm/DebugInfo/CodeView/TypeHashing.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/Support/Allocator.h"
 #include <cassert>
@ -27,13 +28,6 @@ namespace llvm {
 namespace codeview {

 class ContinuationRecordBuilder;
-class TypeHasher;
-
-struct HashedType {
-  hash_code Hash;
-  ArrayRef<uint8_t> Data;
-  TypeIndex Index;
-};

 class MergingTypeTableBuilder : public TypeCollection {
  /// Storage for records.  These need to outlive the TypeTableBuilder.
@ -45,14 +39,11 @@ class MergingTypeTableBuilder : public TypeCollection {
  SimpleTypeSerializer SimpleSerializer;

  /// Hash table.
-  DenseSet<HashedType> HashedRecords;
+  DenseMap<LocallyHashedType, TypeIndex> HashedRecords;

  /// Contains a list of all records indexed by TypeIndex.toArrayIndex().
  SmallVector<ArrayRef<uint8_t>, 2> SeenRecords;

-  /// Contains a list of all hash codes index by TypeIndex.toArrayIndex().
-  SmallVector<hash_code, 2> SeenHashes;
-
 public:
  explicit MergingTypeTableBuilder(BumpPtrAllocator &Storage);
  ~MergingTypeTableBuilder();
@ -73,7 +64,6 @@ public:
  BumpPtrAllocator &getAllocator() { return RecordStorage; }

  ArrayRef<ArrayRef<uint8_t>> records() const;
-  ArrayRef<hash_code> hashes() const;

  TypeIndex insertRecordAs(hash_code Hash, ArrayRef<uint8_t> &Record);
  TypeIndex insertRecordBytes(ArrayRef<uint8_t> &Record);
--- a/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h
@ -0,0 +1,119 @@
+//===- TypeHashing.h ---------------------------------------------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEHASHING_H
+#define LLVM_DEBUGINFO_CODEVIEW_TYPEHASHING_H
+
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+
+namespace llvm {
+namespace codeview {
+
+/// A locally hashed type represents a straightforward hash code of a serialized
+/// record.  The record is simply serialized, and then the bytes are hashed by
+/// a standard algorithm.  This is sufficient for the case of de-duplicating
+/// records within a single sequence of types, because if two records both have
+/// a back-reference to the same type in the same stream, they will both have
+/// the same numeric value for the TypeIndex of the back reference.
+struct LocallyHashedType {
+  hash_code Hash;
+  ArrayRef<uint8_t> RecordData;
+
+  static LocallyHashedType hashType(ArrayRef<uint8_t> RecordData);
+};
+
+/// A globally hashed type represents a hash value that is sufficient to
+/// uniquely identify a record across multiple type streams or type sequences.
+/// This works by, for any given record A which references B, replacing the
+/// TypeIndex that refers to B with a previously-computed global hash for B.  As
+/// this is a recursive algorithm (e.g. the global hash of B also depends on the
+/// global hashes of the types that B refers to), a global hash can uniquely
+/// identify identify that A occurs in another stream that has a completely
+/// different graph structure.  Although the hash itself is slower to compute,
+/// probing is much faster with a globally hashed type, because the hash itself
+/// is considered "as good as" the original type.  Since type records can be
+/// quite large, this makes the equality comparison of the hash much faster than
+/// equality comparison of a full record.
+struct GloballyHashedType {
+  GloballyHashedType() = default;
+  GloballyHashedType(StringRef H)
+      : GloballyHashedType(ArrayRef<uint8_t>(H.bytes_begin(), H.bytes_end())) {}
+  GloballyHashedType(ArrayRef<uint8_t> H) {
+    assert(H.size() == 20);
+    ::memcpy(Hash.data(), H.data(), 20);
+  }
+  std::array<uint8_t, 20> Hash;
+
+  /// Given a sequence of bytes representing a record, compute a global hash for
+  /// this record.  Due to the nature of global hashes incorporating the hashes
+  /// of referenced records, this function requires a list of types and ids
+  /// that RecordData might reference, indexable by TypeIndex.
+  static GloballyHashedType hashType(ArrayRef<uint8_t> RecordData,
+                                     ArrayRef<GloballyHashedType> PreviousTypes,
+                                     ArrayRef<GloballyHashedType> PreviousIds);
+
+  /// Given a sequence of combined type and ID records, compute global hashes
+  /// for each of them, returning the results in a vector of hashed types.
+  template <typename Range>
+  static std::vector<GloballyHashedType> hashTypes(Range &&Records) {
+    std::vector<GloballyHashedType> Hashes;
+    Hashes.reserve(std::distance(std::begin(Records), std::end(Records)));
+    for (const auto &R : Records)
+      Hashes.push_back(hashType(R, Hashes, Hashes));
+
+    return Hashes;
+  }
+};
+} // namespace codeview
+
+template <> struct DenseMapInfo<codeview::LocallyHashedType> {
+  static codeview::LocallyHashedType Empty;
+  static codeview::LocallyHashedType Tombstone;
+
+  static codeview::LocallyHashedType getEmptyKey() { return Empty; }
+
+  static codeview::LocallyHashedType getTombstoneKey() { return Tombstone; }
+
+  static unsigned getHashValue(codeview::LocallyHashedType Val) {
+    return Val.Hash;
+  }
+
+  static bool isEqual(codeview::LocallyHashedType LHS,
+                      codeview::LocallyHashedType RHS) {
+    if (LHS.Hash != RHS.Hash)
+      return false;
+    return LHS.RecordData == RHS.RecordData;
+  }
+};
+
+template <> struct DenseMapInfo<codeview::GloballyHashedType> {
+  static codeview::GloballyHashedType Empty;
+  static codeview::GloballyHashedType Tombstone;
+
+  static codeview::GloballyHashedType getEmptyKey() { return Empty; }
+
+  static codeview::GloballyHashedType getTombstoneKey() { return Tombstone; }
+
+  static unsigned getHashValue(codeview::GloballyHashedType Val) {
+    return *reinterpret_cast<const unsigned *>(Val.Hash.data());
+  }
+
+  static bool isEqual(codeview::GloballyHashedType LHS,
+                      codeview::GloballyHashedType RHS) {
+    return LHS.Hash == RHS.Hash;
+  }
+};
+
+} // namespace llvm
+
+#endif
--- a/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h
@ -334,6 +334,11 @@ public:
  uint32_t Attrs;
  Optional<MemberPointerInfo> MemberInfo;

+  void setAttrs(PointerKind PK, PointerMode PM, PointerOptions PO,
+                uint8_t Size) {
+    Attrs = calcAttrs(PK, PM, PO, Size);
+  }
+
 private:
  static uint32_t calcAttrs(PointerKind PK, PointerMode PM, PointerOptions PO,
                            uint8_t Size) {
--- a/llvm/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/CodeView/CMakeLists.txt
@ -32,6 +32,7 @@ add_llvm_library(LLVMDebugInfoCodeView
  TypeDumpVisitor.cpp
  TypeIndex.cpp
  TypeIndexDiscovery.cpp
+  TypeHashing.cpp
  TypeRecordMapping.cpp
  TypeStreamMerger.cpp
  TypeTableCollection.cpp
--- a/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
+++ b/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
@ -28,27 +28,6 @@
 using namespace llvm;
 using namespace llvm::codeview;

-static HashedType Empty{0, {}, TypeIndex::None()};
-static HashedType Tombstone{hash_code(-1), {}, TypeIndex::None()};
-
-namespace llvm {
-
-template <> struct DenseMapInfo<HashedType> {
-  static inline HashedType getEmptyKey() { return Empty; }
-
-  static inline HashedType getTombstoneKey() { return Tombstone; }
-
-  static unsigned getHashValue(HashedType Val) { return Val.Hash; }
-
-  static bool isEqual(HashedType LHS, HashedType RHS) {
-    if (RHS.Hash != LHS.Hash)
-      return false;
-    return RHS.Data == LHS.Data;
-  }
-};
-
-} // end namespace llvm
-
 TypeIndex MergingTypeTableBuilder::nextTypeIndex() const {
  return TypeIndex::fromArrayIndex(SeenRecords.size());
 }
@ -56,7 +35,6 @@ TypeIndex MergingTypeTableBuilder::nextTypeIndex() const {
 MergingTypeTableBuilder::MergingTypeTableBuilder(BumpPtrAllocator &Storage)
    : RecordStorage(Storage) {
  SeenRecords.reserve(4096);
-  SeenHashes.reserve(4096);
 }

 MergingTypeTableBuilder::~MergingTypeTableBuilder() = default;
@ -102,13 +80,8 @@ ArrayRef<ArrayRef<uint8_t>> MergingTypeTableBuilder::records() const {
  return SeenRecords;
 }

-ArrayRef<hash_code> MergingTypeTableBuilder::hashes() const {
-  return SeenHashes;
-}
-
 void MergingTypeTableBuilder::reset() {
  HashedRecords.clear();
-  SeenHashes.clear();
  SeenRecords.clear();
 }

@ -124,18 +97,19 @@ TypeIndex MergingTypeTableBuilder::insertRecordAs(hash_code Hash,
  assert(Record.size() < UINT32_MAX && "Record too big");
  assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");

-  HashedType TempHashedType = {Hash, Record, nextTypeIndex()};
-  auto Result = HashedRecords.insert(TempHashedType);
+  LocallyHashedType WeakHash{Hash, Record};
+  auto Result = HashedRecords.try_emplace(WeakHash, nextTypeIndex());

  if (Result.second) {
-    Result.first->Data = stabilize(RecordStorage, Record);
-    SeenRecords.push_back(Result.first->Data);
-    SeenHashes.push_back(Result.first->Hash);
+    ArrayRef<uint8_t> RecordData = stabilize(RecordStorage, Record);
+    Result.first->first.RecordData = RecordData;
+    SeenRecords.push_back(RecordData);
  }

  // Update the caller's copy of Record to point a stable copy.
-  Record = Result.first->Data;
-  return Result.first->Index;
+  TypeIndex ActualTI = Result.first->second;
+  Record = SeenRecords[ActualTI.toArrayIndex()];
+  return ActualTI;
 }

 TypeIndex
--- a/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp
+++ b/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp
@ -0,0 +1,74 @@
+//===- TypeHashing.cpp -------------------------------------------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeHashing.h"
+
+#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
+#include "llvm/Support/SHA1.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+LocallyHashedType DenseMapInfo<LocallyHashedType>::Empty{0, {}};
+LocallyHashedType DenseMapInfo<LocallyHashedType>::Tombstone{hash_code(-1), {}};
+
+static std::array<uint8_t, 20> EmptyHash;
+static std::array<uint8_t, 20> TombstoneHash = {
+    0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+
+GloballyHashedType DenseMapInfo<GloballyHashedType>::Empty{EmptyHash};
+GloballyHashedType DenseMapInfo<GloballyHashedType>::Tombstone{TombstoneHash};
+
+LocallyHashedType LocallyHashedType::hashType(ArrayRef<uint8_t> RecordData) {
+  return {llvm::hash_value(RecordData), RecordData};
+}
+
+GloballyHashedType
+GloballyHashedType::hashType(ArrayRef<uint8_t> RecordData,
+                             ArrayRef<GloballyHashedType> PreviousTypes,
+                             ArrayRef<GloballyHashedType> PreviousIds) {
+  SmallVector<TiReference, 4> Refs;
+  discoverTypeIndices(RecordData, Refs);
+  SHA1 S;
+  S.init();
+  uint32_t Off = 0;
+  RecordData = RecordData.drop_front(sizeof(RecordPrefix));
+  for (const auto &Ref : Refs) {
+    // Hash any data that comes before this TiRef.
+    uint32_t PreLen = Ref.Offset - Off;
+    ArrayRef<uint8_t> PreData = RecordData.slice(Off, PreLen);
+    S.update(PreData);
+    auto Prev = (Ref.Kind == TiRefKind::IndexRef) ? PreviousIds : PreviousTypes;
+
+    auto RefData = RecordData.slice(Ref.Offset, Ref.Count * sizeof(TypeIndex));
+    // For each type index referenced, add in the previously computed hash
+    // value of that type.
+    ArrayRef<TypeIndex> Indices(
+        reinterpret_cast<const TypeIndex *>(RefData.data()), Ref.Count);
+    for (TypeIndex TI : Indices) {
+      ArrayRef<uint8_t> BytesToHash;
+      if (TI.isSimple() || TI.isNoneType()) {
+        const uint8_t *IndexBytes = reinterpret_cast<const uint8_t *>(&TI);
+        BytesToHash = makeArrayRef(IndexBytes, sizeof(TypeIndex));
+      } else {
+        BytesToHash = Prev[TI.toArrayIndex()].Hash;
+      }
+      S.update(BytesToHash);
+    }
+
+    Off = Ref.Offset + Ref.Count * sizeof(TypeIndex);
+  }
+
+  // Don't forget to add in any trailing bytes.
+  auto TrailingBytes = RecordData.drop_front(Off);
+  S.update(TrailingBytes);
+
+  return {S.final()};
+}
--- a/llvm/unittests/DebugInfo/CodeView/CMakeLists.txt
+++ b/llvm/unittests/DebugInfo/CodeView/CMakeLists.txt
@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS

 set(DebugInfoCodeViewSources
  RandomAccessVisitorTest.cpp
+  TypeHashingTest.cpp
  TypeIndexDiscoveryTest.cpp
  )

--- a/llvm/unittests/DebugInfo/CodeView/TypeHashingTest.cpp
+++ b/llvm/unittests/DebugInfo/CodeView/TypeHashingTest.cpp
@ -0,0 +1,156 @@
+//===- llvm/unittest/DebugInfo/CodeView/TypeHashingTest.cpp ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeHashing.h"
+#include "llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+static TypeIndex createPointerRecord(AppendingTypeTableBuilder &Builder,
+                                     TypeIndex TI) {
+  PointerRecord PR(TypeRecordKind::Pointer);
+  PR.setAttrs(PointerKind::Near32, PointerMode::Pointer, PointerOptions::None,
+              4);
+  PR.ReferentType = TI;
+  return Builder.writeLeafType(PR);
+}
+
+static TypeIndex createArgListRecord(AppendingTypeTableBuilder &Builder,
+                                     TypeIndex Q, TypeIndex R) {
+  ArgListRecord AR(TypeRecordKind::ArgList);
+  AR.ArgIndices.push_back(Q);
+  AR.ArgIndices.push_back(R);
+  return Builder.writeLeafType(AR);
+}
+
+static TypeIndex createProcedureRecord(AppendingTypeTableBuilder &Builder,
+                                       uint32_t ParamCount, TypeIndex Return,
+                                       TypeIndex ArgList) {
+  ProcedureRecord PR(TypeRecordKind::Procedure);
+  PR.ArgumentList = ArgList;
+  PR.CallConv = CallingConvention::NearC;
+  PR.Options = FunctionOptions::None;
+  PR.ParameterCount = ParamCount;
+  PR.ReturnType = Return;
+  return Builder.writeLeafType(PR);
+}
+
+static ArrayRef<uint8_t> hash_of(ArrayRef<GloballyHashedType> Hashes,
+                                 TypeIndex TI) {
+  return Hashes[TI.toArrayIndex()].Hash;
+}
+
+static void verifyHashUniqueness(ArrayRef<GloballyHashedType> Hashes) {
+  assert(!Hashes.empty());
+
+  for (size_t I = 0; I < Hashes.size() - 1; ++I) {
+    for (size_t J = I + 1; J < Hashes.size(); ++J) {
+      EXPECT_NE(Hashes[I].Hash, Hashes[J].Hash);
+    }
+  }
+}
+
+TEST(TypeHashingTest, ContentHash) {
+  SimpleTypeSerializer Serializer;
+
+  TypeIndex CharStar(SimpleTypeKind::SignedCharacter,
+                     SimpleTypeMode::NearPointer32);
+
+  BumpPtrAllocator Alloc;
+  AppendingTypeTableBuilder Ordering1(Alloc);
+  AppendingTypeTableBuilder Ordering2(Alloc);
+
+  TypeIndex CharP(SimpleTypeKind::SignedCharacter, SimpleTypeMode::NearPointer);
+  TypeIndex IntP(SimpleTypeKind::Int32, SimpleTypeMode::NearPointer);
+  TypeIndex DoubleP(SimpleTypeKind::Float64, SimpleTypeMode::NearPointer);
+
+  // We're going to the same type sequence with two different orderings, and
+  // then confirm all records are hashed the same.
+
+  TypeIndex CharPP[2];
+  TypeIndex IntPP[2];
+  TypeIndex IntPPP[2];
+  TypeIndex DoublePP[2];
+  TypeIndex Args[2];
+  TypeIndex Proc[2];
+
+  // Ordering 1
+  // ----------------------------------------
+  // LF_POINTER             0x1000   {char**}
+  //   Referent = char*
+  // LF_POINTER             0x1001   {int**}
+  //   Referent = int*
+  // LF_POINTER             0x1002   {int***}
+  //   Referent = 0x1001
+  // LF_ARGLIST             0x1003   {(char**, int***)}
+  //   Arg[0] = 0x1000
+  //   Arg[1] = 0x1002
+  // LF_PROCEDURE           0x1004   {int** func(char**, int***)}
+  //   ArgList = 0x1003
+  //   ReturnType = 0x1001
+  std::vector<GloballyHashedType> Ordering1Hashes;
+  CharPP[0] = createPointerRecord(Ordering1, CharP);
+  IntPP[0] = createPointerRecord(Ordering1, IntP);
+  IntPPP[0] = createPointerRecord(Ordering1, IntPP[0]);
+  Args[0] = createArgListRecord(Ordering1, CharPP[0], IntPPP[0]);
+  Proc[0] = createProcedureRecord(Ordering1, 2, IntPP[0], Args[0]);
+
+  ASSERT_EQ(0x1000U, CharPP[0].getIndex());
+  ASSERT_EQ(0x1001U, IntPP[0].getIndex());
+  ASSERT_EQ(0x1002U, IntPPP[0].getIndex());
+  ASSERT_EQ(0x1003U, Args[0].getIndex());
+  ASSERT_EQ(0x1004U, Proc[0].getIndex());
+
+  auto Hashes1 = GloballyHashedType::hashTypes(Ordering1.records());
+
+  // Ordering 2
+  // ----------------------------------------
+  // LF_POINTER             0x1000   {int**}
+  //   Referent = int*
+  // LF_POINTER             0x1001   {int***}
+  //   Referent = 0x1000
+  // LF_POINTER             0x1002   {char**}
+  //   Referent = char*
+  // LF_POINTER             0x1003   {double**}
+  //   Referent = double*
+  // LF_ARGLIST             0x1004   {(char**, int***)}
+  //   Arg[0] = 0x1002
+  //   Arg[1] = 0x1001
+  // LF_PROCEDURE           0x1005   {int** func(char**, int***)}
+  //   ArgList = 0x1004
+  //   ReturnType = 0x1000
+  IntPP[1] = createPointerRecord(Ordering2, IntP);
+  IntPPP[1] = createPointerRecord(Ordering2, IntPP[1]);
+  CharPP[1] = createPointerRecord(Ordering2, CharP);
+  DoublePP[1] = createPointerRecord(Ordering2, DoubleP);
+  Args[1] = createArgListRecord(Ordering2, CharPP[1], IntPPP[1]);
+  Proc[1] = createProcedureRecord(Ordering2, 2, IntPP[1], Args[1]);
+  auto Hashes2 = GloballyHashedType::hashTypes(Ordering2.records());
+
+  ASSERT_EQ(0x1000U, IntPP[1].getIndex());
+  ASSERT_EQ(0x1001U, IntPPP[1].getIndex());
+  ASSERT_EQ(0x1002U, CharPP[1].getIndex());
+  ASSERT_EQ(0x1003U, DoublePP[1].getIndex());
+  ASSERT_EQ(0x1004U, Args[1].getIndex());
+  ASSERT_EQ(0x1005U, Proc[1].getIndex());
+
+  // Sanity check to make sure all same-ordering hashes are different
+  // from each other.
+  verifyHashUniqueness(Hashes1);
+  verifyHashUniqueness(Hashes2);
+
+  EXPECT_EQ(hash_of(Hashes1, IntPP[0]), hash_of(Hashes2, IntPP[1]));
+  EXPECT_EQ(hash_of(Hashes1, IntPPP[0]), hash_of(Hashes2, IntPPP[1]));
+  EXPECT_EQ(hash_of(Hashes1, CharPP[0]), hash_of(Hashes2, CharPP[1]));
+  EXPECT_EQ(hash_of(Hashes1, Args[0]), hash_of(Hashes2, Args[1]));
+  EXPECT_EQ(hash_of(Hashes1, Proc[0]), hash_of(Hashes2, Proc[1]));
+}