[XRay][compiler-rt] Avoid InternalAlloc(...) in Profiling Mode

Summary:
We avoid using dynamic memory allocated with the internal allocator in
the profile collection service used by profiling mode. Instead we use
aligned storage for globals, and in-struct aligned storage for objects
we initialize dynamically (via placement new).
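
For illustration, the pattern looks roughly like this (a minimal sketch;
`Widget`, `WidgetStorage`, and `initWidget` are hypothetical names, not part
of this change):

    #include <new>
    #include <type_traits>

    struct Widget {
      int Count = 0;
    };

    // Global aligned storage has no dynamic initializer to run at startup;
    // the object is placement-new'ed into it on first use, so no call into
    // InternalAlloc (or malloc) is ever made.
    static typename std::aligned_storage<sizeof(Widget), alignof(Widget)>::type
        WidgetStorage;
    static Widget *GlobalWidget = nullptr;

    static void initWidget() {
      GlobalWidget = reinterpret_cast<Widget *>(&WidgetStorage);
      new (GlobalWidget) Widget();
    }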

We also remove the dependency on `Vector<...>`, which itself uses the
dynamic allocator in sanitizer_common (InternalAlloc), in favour of the
XRay allocator and segmented array implementation.
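
Condensed from the hunks below, the replacement pattern reads as follows
(these are the diff's own names pulled together, not new API):

    using ThreadTriesArray = Array<ThreadTrie>;
    using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;

    // Both the allocator and the segmented array live in global aligned
    // storage and are constructed with placement new in reset().
    ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
        &ThreadTriesArrayAllocatorStorage);
    new (ThreadTriesAllocator)
        ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
    ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
    new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);

    // Elements are appended segment by segment; no contiguous reallocation
    // and no InternalAlloc, unlike Vector<...>::PushBack().
    ThreadTrie *Item = ThreadTries->Append({});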

This change addresses llvm.org/PR38577.

Reviewers: eizan

Reviewed By: eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D50782

llvm-svn: 339978
Author: Dean Michael Berris
Date:   2018-08-17 01:57:42 +00:00
parent 973a557338
commit 21d4a1eec7
2 changed files with 112 additions and 102 deletions

compiler-rt/lib/xray/xray_profile_collector.cc

@@ -13,10 +13,10 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_allocator.h"
 #include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
 #include <memory>
 #include <pthread.h>
 #include <utility>
@@ -29,7 +29,7 @@ namespace {
 SpinMutex GlobalMutex;
 struct ThreadTrie {
   tid_t TId;
-  FunctionCallTrie *Trie;
+  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
 };
 
 struct ProfileBuffer {
@@ -56,65 +56,68 @@ struct BlockHeader {
   u64 ThreadId;
 };
 
-// These need to be pointers that point to heap/internal-allocator-allocated
-// objects because these are accessed even at program exit.
-Vector<ThreadTrie> *ThreadTries = nullptr;
-Vector<ProfileBuffer> *ProfileBuffers = nullptr;
-FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+using ThreadTriesArray = Array<ThreadTrie>;
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
+    AllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
+    ThreadTriesStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+    ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
+    ThreadTriesArrayAllocatorStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+    ProfileBufferArrayAllocatorStorage;
+
+static ThreadTriesArray *ThreadTries = nullptr;
+static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+static void *allocateBuffer(size_t S) {
+  auto B = reinterpret_cast<void *>(internal_mmap(
+      NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+  if (B == MAP_FAILED) {
+    if (Verbosity())
+      Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
+    return nullptr;
+  }
+  return B;
+}
+
+static void deallocateBuffer(void *B, size_t S) {
+  if (B == nullptr)
+    return;
+  internal_munmap(B, S);
+}
 
 } // namespace
 
 void post(const FunctionCallTrie &T, tid_t TId) {
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  pthread_once(&Once, +[] {
-    SpinMutexLock Lock(&GlobalMutex);
-    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
-    new (GlobalAllocators) FunctionCallTrie::Allocators();
-    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
-        profilingFlags()->global_allocator_max);
-    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-        InternalAlloc(sizeof(Vector<ThreadTrie>)));
-    new (ThreadTries) Vector<ThreadTrie>();
-    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-    new (ProfileBuffers) Vector<ProfileBuffer>();
-  });
-  DCHECK_NE(GlobalAllocators, nullptr);
-  DCHECK_NE(ThreadTries, nullptr);
-  DCHECK_NE(ProfileBuffers, nullptr);
+  pthread_once(&Once, +[] { reset(); });
 
   ThreadTrie *Item = nullptr;
   {
     SpinMutexLock Lock(&GlobalMutex);
-    if (GlobalAllocators == nullptr)
+    if (GlobalAllocators == nullptr || ThreadTries == nullptr)
       return;
 
-    Item = ThreadTries->PushBack();
+    Item = ThreadTries->Append({});
     Item->TId = TId;
 
-    // Here we're using the internal allocator instead of the managed allocator
-    // because:
-    //
-    // 1) We're not using the segmented array data structure to host
-    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
-    //    which works like a std::vector<...> keeping elements contiguous in
-    //    memory. The segmented array data structure assumes that elements are
-    //    trivially destructible, where FunctionCallTrie isn't.
-    //
-    // 2) Using a managed allocator means we need to manage that separately,
-    //    which complicates the nature of this code. To get around that, we're
-    //    using the internal allocator instead, which has its own global state
-    //    and is decoupled from the lifetime management required by the managed
-    //    allocator we have in XRay.
-    //
-    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
-        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
-    DCHECK_NE(Item->Trie, nullptr);
-    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+    auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+    new (Trie) FunctionCallTrie(*GlobalAllocators);
   }
 
-  T.deepCopyInto(*Item->Trie);
+  auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+  T.deepCopyInto(*Trie);
 }
 
 // A PathArray represents the function id's representing a stack trace. In this
@@ -127,18 +130,12 @@ struct ProfileRecord {
   // The Path in this record is the function id's from the leaf to the root of
   // the function call stack as represented from a FunctionCallTrie.
-  PathArray *Path = nullptr;
+  PathArray Path;
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
   ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
-      : Path([&] {
-          auto P =
-              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A);
-          return P;
-        }()),
-        Node(N) {}
+      : Path(A), Node(N) {}
 };
 
 namespace {
@@ -167,8 +164,8 @@ static void populateRecords(ProfileRecordArray &PRs,
       // Traverse the Node's parents and as we're doing so, get the FIds in
       // the order they appear.
       for (auto N = Node; N != nullptr; N = N->Parent)
-        Record->Path->Append(N->FId);
-      DCHECK(!Record->Path->empty());
+        Record->Path.Append(N->FId);
+      DCHECK(!Record->Path.empty());
 
       for (const auto C : Node->Callees)
         DFSStack.Append(C.NodePtr);
@@ -183,7 +180,7 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                  sizeof(Header);
   for (const auto &Record : ProfileRecords) {
     // List of IDs follow:
-    for (const auto FId : *Record.Path)
+    for (const auto FId : Record.Path)
       NextPtr =
           static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
           sizeof(FId);
@@ -213,16 +210,21 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
 void serialize() {
   SpinMutexLock Lock(&GlobalMutex);
 
-  // Clear out the global ProfileBuffers.
-  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-    InternalFree((*ProfileBuffers)[I].Data);
-  ProfileBuffers->Reset();
+  if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
+      ProfileBuffers == nullptr)
+    return;
 
-  if (ThreadTries->Size() == 0)
+  // Clear out the global ProfileBuffers, if it's not empty.
+  for (auto &B : *ProfileBuffers)
+    deallocateBuffer(B.Data, B.Size);
+  ProfileBuffers->trim(ProfileBuffers->size());
+
+  if (ThreadTries->empty())
     return;
 
   // Then repopulate the global ProfileBuffers.
-  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+  u32 I = 0;
+  for (const auto &ThreadTrie : *ThreadTries) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
     ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
@@ -233,9 +235,11 @@ void serialize() {
     // use a local allocator and an __xray::Array<...> to store the intermediary
    // data, then compute the size as we're going along. Then we'll allocate the
     // contiguous space to contain the thread buffer data.
-    const auto &Trie = *(*ThreadTries)[I].Trie;
+    const auto &Trie =
+        *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
     if (Trie.getRoots().empty())
       continue;
+
     populateRecords(ProfileRecords, PathAlloc, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
@@ -251,68 +255,71 @@ void serialize() {
     //   + end of record (8 bytes)
     u32 CumulativeSizes = 0;
     for (const auto &Record : ProfileRecords)
-      CumulativeSizes += 20 + (4 * Record.Path->size());
+      CumulativeSizes += 20 + (4 * Record.Path.size());
 
-    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
-    auto Buffer = ProfileBuffers->PushBack();
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto Buffer = ProfileBuffers->Append({});
     Buffer->Size = sizeof(Header) + CumulativeSizes;
-    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    Buffer->Data = allocateBuffer(Buffer->Size);
     DCHECK_NE(Buffer->Data, nullptr);
     serializeRecords(Buffer, Header, ProfileRecords);
-
-    // Now clean up the ProfileRecords array, one at a time.
-    for (auto &Record : ProfileRecords) {
-      Record.Path->~PathArray();
-      InternalFree(Record.Path);
-    }
   }
 }
 
 void reset() {
   SpinMutexLock Lock(&GlobalMutex);
 
   if (ProfileBuffers != nullptr) {
     // Clear out the profile buffers that have been serialized.
-    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-      InternalFree((*ProfileBuffers)[I].Data);
-    ProfileBuffers->Reset();
-    InternalFree(ProfileBuffers);
-    ProfileBuffers = nullptr;
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(B.Data, B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
   }
 
   if (ThreadTries != nullptr) {
     // Clear out the function call tries per thread.
-    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
-      auto &T = (*ThreadTries)[I];
-      T.Trie->~FunctionCallTrie();
-      InternalFree(T.Trie);
+    for (auto &T : *ThreadTries) {
+      auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
+      Trie->~FunctionCallTrie();
     }
-    ThreadTries->Reset();
-    InternalFree(ThreadTries);
-    ThreadTries = nullptr;
+    ThreadTries->trim(ThreadTries->size());
   }
 
   // Reset the global allocators.
-  if (GlobalAllocators != nullptr) {
+  if (GlobalAllocators != nullptr)
     GlobalAllocators->~Allocators();
-    InternalFree(GlobalAllocators);
-    GlobalAllocators = nullptr;
-  }
-  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+
+  GlobalAllocators =
+      reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
   new (GlobalAllocators) FunctionCallTrie::Allocators();
   *GlobalAllocators = FunctionCallTrie::InitAllocators();
-  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-      InternalAlloc(sizeof(Vector<ThreadTrie>)));
-  new (ThreadTries) Vector<ThreadTrie>();
-  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-  new (ProfileBuffers) Vector<ProfileBuffer>();
+
+  if (ThreadTriesAllocator != nullptr)
+    ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+
+  ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
+      &ThreadTriesArrayAllocatorStorage);
+  new (ThreadTriesAllocator)
+      ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
+  ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
+  new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+
+  if (ProfileBuffersAllocator != nullptr)
+    ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+
+  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+      &ProfileBufferArrayAllocatorStorage);
+  new (ProfileBuffersAllocator)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+  ProfileBuffers =
+      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+  new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
 }
 
 XRayBuffer nextBuffer(XRayBuffer B) {
   SpinMutexLock Lock(&GlobalMutex);
 
-  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
     return {nullptr, 0};
 
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
@@ -336,7 +343,7 @@ XRayBuffer nextBuffer(XRayBuffer B) {
   BlockHeader Header;
   internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
   auto NextBlock = Header.BlockNum + 1;
-  if (NextBlock < ProfileBuffers->Size())
+  if (NextBlock < ProfileBuffers->size())
     return {(*ProfileBuffers)[NextBlock].Data,
             (*ProfileBuffers)[NextBlock].Size};
   return {nullptr, 0};
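
Taken together, the collector's buffer lifecycle in this file no longer
touches InternalAlloc at all; condensed from the hunks above:

    // serialize(): size each block, then back it with anonymous mmap pages.
    Buffer->Size = sizeof(Header) + CumulativeSizes;
    Buffer->Data = allocateBuffer(Buffer->Size);  // internal_mmap underneath

    // serialize()/reset(): hand the pages back to the OS rather than calling
    // InternalFree, then logically empty the array while keeping its segments.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(B.Data, B.Size);  // internal_munmap underneath
    ProfileBuffers->trim(ProfileBuffers->size());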

compiler-rt/lib/xray/xray_segmented_array.h

@@ -325,6 +325,9 @@ public:
   /// Remove N Elements from the end. This leaves the blocks behind, and does
   /// not require allocating new blocks for elements added after trimming.
   void trim(size_t Elements) {
+    if (Elements == 0)
+      return;
+
     DCHECK_LE(Elements, Size);
     DCHECK_GT(Size, 0);
     auto OldSize = Size;
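
The new guard makes trimming by zero elements a no-op, which the collector
now relies on when it calls trim(size()) on a possibly-empty array; without
it, the DCHECK_GT(Size, 0) above would fire for empty arrays. In condensed
form (the rest of the function is unchanged and elided here, as in the hunk
above):

    void trim(size_t Elements) {
      if (Elements == 0)  // trimming nothing must be safe on an empty array
        return;

      DCHECK_LE(Elements, Size);
      DCHECK_GT(Size, 0);
      auto OldSize = Size;
      // ... (unchanged: shrink Size, keep the blocks for later reuse)
    }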