[XRay][compiler-rt] Avoid InternalAlloc(...) in Profiling Mode

Summary:
We avoid using dynamic memory allocated with the internal allocator in
the profile collection service used by profiling mode. Instead we use
aligned storage for globals, and in-struct aligned storage for objects
we initialize dynamically (via placement new).
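
For illustration, the pattern looks roughly like this (a minimal sketch;
`Widget`, `WidgetStorage`, and `initWidget` are hypothetical names, not part
of this change):

    #include <new>
    #include <type_traits>

    struct Widget {
      int Count = 0;
    };

    // Global aligned storage has no dynamic initializer to run at startup;
    // the object is placement-new'ed into it on first use, so no call into
    // InternalAlloc (or malloc) is ever made.
    static typename std::aligned_storage<sizeof(Widget), alignof(Widget)>::type
        WidgetStorage;
    static Widget *GlobalWidget = nullptr;

    static void initWidget() {
      GlobalWidget = reinterpret_cast<Widget *>(&WidgetStorage);
      new (GlobalWidget) Widget();
    }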

We also remove the dependency on `Vector<...>`, which itself uses the
dynamic allocator in sanitizer_common (InternalAlloc), in favour of the
XRay allocator and segmented array implementation.
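
Condensed from the hunks below, the replacement pattern reads as follows
(these are the diff's own names pulled together, not new API):

    using ThreadTriesArray = Array<ThreadTrie>;
    using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;

    // Both the allocator and the segmented array live in global aligned
    // storage and are constructed with placement new in reset().
    ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
        &ThreadTriesArrayAllocatorStorage);
    new (ThreadTriesAllocator)
        ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
    ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
    new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);

    // Elements are appended segment by segment; no contiguous reallocation
    // and no InternalAlloc, unlike Vector<...>::PushBack().
    ThreadTrie *Item = ThreadTries->Append({});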

This change addresses llvm.org/PR38577.

Reviewers: eizan

Reviewed By: eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D50782

llvm-svn: 339978
Author: Dean Michael Berris
Date:   2018-08-17 01:57:42 +00:00
parent 973a557338
commit 21d4a1eec7
2 changed files with 112 additions and 102 deletions

compiler-rt/lib/xray/xray_profile_collector.cc

@@ -13,10 +13,10 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_allocator.h"
 #include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
 #include <memory>
 #include <pthread.h>
 #include <utility>
@@ -29,7 +29,7 @@ namespace {
 SpinMutex GlobalMutex;
 struct ThreadTrie {
   tid_t TId;
-  FunctionCallTrie *Trie;
+  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
 };
 
 struct ProfileBuffer {
@@ -56,65 +56,68 @@ struct BlockHeader {
   u64 ThreadId;
 };
 
-// These need to be pointers that point to heap/internal-allocator-allocated
-// objects because these are accessed even at program exit.
-Vector<ThreadTrie> *ThreadTries = nullptr;
-Vector<ProfileBuffer> *ProfileBuffers = nullptr;
-FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+using ThreadTriesArray = Array<ThreadTrie>;
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
+    AllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
+    ThreadTriesStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+    ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
+    ThreadTriesArrayAllocatorStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+    ProfileBufferArrayAllocatorStorage;
+
+static ThreadTriesArray *ThreadTries = nullptr;
+static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+static void *allocateBuffer(size_t S) {
+  auto B = reinterpret_cast<void *>(internal_mmap(
+      NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+  if (B == MAP_FAILED) {
+    if (Verbosity())
+      Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
+    return nullptr;
+  }
+  return B;
+}
+
+static void deallocateBuffer(void *B, size_t S) {
+  if (B == nullptr)
+    return;
+  internal_munmap(B, S);
+}
 
 } // namespace
 
 void post(const FunctionCallTrie &T, tid_t TId) {
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  pthread_once(&Once, +[] {
-    SpinMutexLock Lock(&GlobalMutex);
-    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
-    new (GlobalAllocators) FunctionCallTrie::Allocators();
-    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
-        profilingFlags()->global_allocator_max);
-    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-        InternalAlloc(sizeof(Vector<ThreadTrie>)));
-    new (ThreadTries) Vector<ThreadTrie>();
-    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-    new (ProfileBuffers) Vector<ProfileBuffer>();
-  });
-  DCHECK_NE(GlobalAllocators, nullptr);
-  DCHECK_NE(ThreadTries, nullptr);
-  DCHECK_NE(ProfileBuffers, nullptr);
+  pthread_once(&Once, +[] { reset(); });
 
   ThreadTrie *Item = nullptr;
   {
     SpinMutexLock Lock(&GlobalMutex);
-    if (GlobalAllocators == nullptr)
+    if (GlobalAllocators == nullptr || ThreadTries == nullptr)
       return;
 
-    Item = ThreadTries->PushBack();
+    Item = ThreadTries->Append({});
     Item->TId = TId;
 
-    // Here we're using the internal allocator instead of the managed allocator
-    // because:
-    //
-    // 1) We're not using the segmented array data structure to host
-    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
-    //    which works like a std::vector<...> keeping elements contiguous in
-    //    memory. The segmented array data structure assumes that elements are
-    //    trivially destructible, where FunctionCallTrie isn't.
-    //
-    // 2) Using a managed allocator means we need to manage that separately,
-    //    which complicates the nature of this code. To get around that, we're
-    //    using the internal allocator instead, which has its own global state
-    //    and is decoupled from the lifetime management required by the managed
-    //    allocator we have in XRay.
-    //
-    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
-        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
-    DCHECK_NE(Item->Trie, nullptr);
-    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+    auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+    new (Trie) FunctionCallTrie(*GlobalAllocators);
   }
 
-  T.deepCopyInto(*Item->Trie);
+  auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+  T.deepCopyInto(*Trie);
 }
 
 // A PathArray represents the function id's representing a stack trace. In this
@@ -127,18 +130,12 @@ struct ProfileRecord {
   // The Path in this record is the function id's from the leaf to the root of
   // the function call stack as represented from a FunctionCallTrie.
-  PathArray *Path = nullptr;
+  PathArray Path;
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
   ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
-      : Path([&] {
-          auto P =
-              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A);
-          return P;
-        }()),
-        Node(N) {}
+      : Path(A), Node(N) {}
 };
 
 namespace {
@@ -167,8 +164,8 @@ static void populateRecords(ProfileRecordArray &PRs,
       // Traverse the Node's parents and as we're doing so, get the FIds in
       // the order they appear.
       for (auto N = Node; N != nullptr; N = N->Parent)
-        Record->Path->Append(N->FId);
-      DCHECK(!Record->Path->empty());
+        Record->Path.Append(N->FId);
+      DCHECK(!Record->Path.empty());
 
       for (const auto C : Node->Callees)
         DFSStack.Append(C.NodePtr);
@@ -183,7 +180,7 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                  sizeof(Header);
   for (const auto &Record : ProfileRecords) {
     // List of IDs follow:
-    for (const auto FId : *Record.Path)
+    for (const auto FId : Record.Path)
       NextPtr =
           static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
           sizeof(FId);
@@ -213,16 +210,21 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
 void serialize() {
   SpinMutexLock Lock(&GlobalMutex);
 
-  // Clear out the global ProfileBuffers.
-  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-    InternalFree((*ProfileBuffers)[I].Data);
-  ProfileBuffers->Reset();
+  if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
+      ProfileBuffers == nullptr)
+    return;
 
-  if (ThreadTries->Size() == 0)
+  // Clear out the global ProfileBuffers, if it's not empty.
+  for (auto &B : *ProfileBuffers)
+    deallocateBuffer(B.Data, B.Size);
+  ProfileBuffers->trim(ProfileBuffers->size());
+
+  if (ThreadTries->empty())
     return;
 
   // Then repopulate the global ProfileBuffers.
-  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+  u32 I = 0;
+  for (const auto &ThreadTrie : *ThreadTries) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
     ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
@@ -233,9 +235,11 @@ void serialize() {
     // use a local allocator and an __xray::Array<...> to store the intermediary
    // data, then compute the size as we're going along. Then we'll allocate the
     // contiguous space to contain the thread buffer data.
-    const auto &Trie = *(*ThreadTries)[I].Trie;
+    const auto &Trie =
+        *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
     if (Trie.getRoots().empty())
       continue;
+
     populateRecords(ProfileRecords, PathAlloc, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
@@ -251,68 +255,71 @@ void serialize() {
     //   + end of record (8 bytes)
     u32 CumulativeSizes = 0;
     for (const auto &Record : ProfileRecords)
-      CumulativeSizes += 20 + (4 * Record.Path->size());
+      CumulativeSizes += 20 + (4 * Record.Path.size());
 
-    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
-    auto Buffer = ProfileBuffers->PushBack();
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto Buffer = ProfileBuffers->Append({});
     Buffer->Size = sizeof(Header) + CumulativeSizes;
-    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    Buffer->Data = allocateBuffer(Buffer->Size);
     DCHECK_NE(Buffer->Data, nullptr);
     serializeRecords(Buffer, Header, ProfileRecords);
-
-    // Now clean up the ProfileRecords array, one at a time.
-    for (auto &Record : ProfileRecords) {
-      Record.Path->~PathArray();
-      InternalFree(Record.Path);
-    }
   }
 }
 
 void reset() {
   SpinMutexLock Lock(&GlobalMutex);
 
   if (ProfileBuffers != nullptr) {
     // Clear out the profile buffers that have been serialized.
-    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-      InternalFree((*ProfileBuffers)[I].Data);
-    ProfileBuffers->Reset();
-    InternalFree(ProfileBuffers);
-    ProfileBuffers = nullptr;
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(B.Data, B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
   }
 
   if (ThreadTries != nullptr) {
     // Clear out the function call tries per thread.
-    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
-      auto &T = (*ThreadTries)[I];
-      T.Trie->~FunctionCallTrie();
-      InternalFree(T.Trie);
+    for (auto &T : *ThreadTries) {
+      auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
+      Trie->~FunctionCallTrie();
     }
-    ThreadTries->Reset();
-    InternalFree(ThreadTries);
-    ThreadTries = nullptr;
+    ThreadTries->trim(ThreadTries->size());
   }
 
   // Reset the global allocators.
-  if (GlobalAllocators != nullptr) {
+  if (GlobalAllocators != nullptr)
     GlobalAllocators->~Allocators();
-    InternalFree(GlobalAllocators);
-    GlobalAllocators = nullptr;
-  }
-  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+
+  GlobalAllocators =
+      reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
   new (GlobalAllocators) FunctionCallTrie::Allocators();
   *GlobalAllocators = FunctionCallTrie::InitAllocators();
-  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-      InternalAlloc(sizeof(Vector<ThreadTrie>)));
-  new (ThreadTries) Vector<ThreadTrie>();
-  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-  new (ProfileBuffers) Vector<ProfileBuffer>();
+
+  if (ThreadTriesAllocator != nullptr)
+    ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+
+  ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
+      &ThreadTriesArrayAllocatorStorage);
+  new (ThreadTriesAllocator)
+      ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
+  ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
+  new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+
+  if (ProfileBuffersAllocator != nullptr)
+    ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+
+  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+      &ProfileBufferArrayAllocatorStorage);
+  new (ProfileBuffersAllocator)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+  ProfileBuffers =
+      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+  new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
 }
 
 XRayBuffer nextBuffer(XRayBuffer B) {
   SpinMutexLock Lock(&GlobalMutex);
 
-  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
     return {nullptr, 0};
 
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
@@ -336,7 +343,7 @@ XRayBuffer nextBuffer(XRayBuffer B) {
   BlockHeader Header;
   internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
   auto NextBlock = Header.BlockNum + 1;
-  if (NextBlock < ProfileBuffers->Size())
+  if (NextBlock < ProfileBuffers->size())
     return {(*ProfileBuffers)[NextBlock].Data,
             (*ProfileBuffers)[NextBlock].Size};
   return {nullptr, 0};
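
Taken together, the collector's buffer lifecycle in this file no longer
touches InternalAlloc at all; condensed from the hunks above:

    // serialize(): size each block, then back it with anonymous mmap pages.
    Buffer->Size = sizeof(Header) + CumulativeSizes;
    Buffer->Data = allocateBuffer(Buffer->Size);  // internal_mmap underneath

    // serialize()/reset(): hand the pages back to the OS rather than calling
    // InternalFree, then logically empty the array while keeping its segments.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(B.Data, B.Size);  // internal_munmap underneath
    ProfileBuffers->trim(ProfileBuffers->size());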

compiler-rt/lib/xray/xray_segmented_array.h

@@ -325,6 +325,9 @@ public:
   /// Remove N Elements from the end. This leaves the blocks behind, and does
   /// not require allocating new blocks for elements added after trimming.
   void trim(size_t Elements) {
+    if (Elements == 0)
+      return;
+
     DCHECK_LE(Elements, Size);
     DCHECK_GT(Size, 0);
     auto OldSize = Size;
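
The new guard makes trimming by zero elements a no-op, which the collector
now relies on when it calls trim(size()) on a possibly-empty array; without
it, the DCHECK_GT(Size, 0) above would fire for empty arrays. In condensed
form (the rest of the function is unchanged and elided here, as in the hunk
above):

    void trim(size_t Elements) {
      if (Elements == 0)  // trimming nothing must be safe on an empty array
        return;

      DCHECK_LE(Elements, Size);
      DCHECK_GT(Size, 0);
      auto OldSize = Size;
      // ... (unchanged: shrink Size, keep the blocks for later reuse)
    }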