[XRay][compiler-rt] Avoid InternalAlloc(...) in Profiling Mode
Summary:
We avoid using dynamic memory allocated with the internal allocator in the profile collection service used by profiling mode. We use aligned storage for globals and in-struct storage for objects we dynamically initialize. We also remove the dependency on `Vector<...>`, which internally uses the dynamic allocator in sanitizer_common (InternalAlloc), in favour of the XRay allocator and segmented array implementation. This change addresses llvm.org/PR38577.

Reviewers: eizan

Reviewed By: eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D50782

llvm-svn: 339978
This commit is contained in: parent 973a557338, commit 21d4a1eec7
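The pattern at the core of the change: instead of obtaining long-lived singletons from InternalAlloc(...), the collector reserves static, suitably aligned storage and constructs objects into it with placement new, so nothing runs during static initialization and nothing has to be handed back to the sanitizer allocator at program exit. A minimal stand-alone sketch of that pattern follows; the Widget type and resetWidget function are hypothetical stand-ins, not names from the patch:

    #include <new>
    #include <type_traits>

    struct Widget {
      explicit Widget(int Max) : Max(Max) {}
      int Max;
    };

    // Raw, correctly aligned bytes reserved up front; no constructor runs
    // during static initialization.
    static std::aligned_storage<sizeof(Widget), alignof(Widget)>::type
        WidgetStorage;
    static Widget *GlobalWidget = nullptr;

    void resetWidget(int Max) {
      if (GlobalWidget != nullptr)
        GlobalWidget->~Widget();      // destroy the previous instance in place
      GlobalWidget = reinterpret_cast<Widget *>(&WidgetStorage);
      new (GlobalWidget) Widget(Max); // placement-new into the static storage
    }

In the diff below, reset() applies this same idea to the FunctionCallTrie allocators, the thread-trie array, and the profile-buffer array, and ThreadTrie gains an in-struct TrieStorage member so each per-thread trie lives inside its array element rather than behind an InternalAlloc'd pointer.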
@@ -13,10 +13,10 @@
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_vector.h"
#include "xray_allocator.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>
@@ -29,7 +29,7 @@ namespace {
SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  FunctionCallTrie *Trie;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

struct ProfileBuffer {
@@ -56,65 +56,68 @@ struct BlockHeader {
  u64 ThreadId;
};

// These need to be pointers that point to heap/internal-allocator-allocated
// objects because these are accessed even at program exit.
Vector<ThreadTrie> *ThreadTries = nullptr;
Vector<ProfileBuffer> *ProfileBuffers = nullptr;
FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
using ThreadTriesArray = Array<ThreadTrie>;
using ProfileBufferArray = Array<ProfileBuffer>;
using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
    AllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
    ThreadTriesStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
    ThreadTriesArrayAllocatorStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ThreadTriesArray *ThreadTries = nullptr;
static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;
static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;

static void *allocateBuffer(size_t S) {
  auto B = reinterpret_cast<void *>(internal_mmap(
      NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
  if (B == MAP_FAILED) {
    if (Verbosity())
      Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
    return nullptr;
  }
  return B;
}

static void deallocateBuffer(void *B, size_t S) {
  if (B == nullptr)
    return;
  internal_munmap(B, S);
}

} // namespace

void post(const FunctionCallTrie &T, tid_t TId) {
  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  pthread_once(&Once, +[] {
    SpinMutexLock Lock(&GlobalMutex);
    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
    new (GlobalAllocators) FunctionCallTrie::Allocators();
    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
        profilingFlags()->global_allocator_max);
    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
        InternalAlloc(sizeof(Vector<ThreadTrie>)));
    new (ThreadTries) Vector<ThreadTrie>();
    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
    new (ProfileBuffers) Vector<ProfileBuffer>();
  });
  DCHECK_NE(GlobalAllocators, nullptr);
  DCHECK_NE(ThreadTries, nullptr);
  DCHECK_NE(ProfileBuffers, nullptr);
  pthread_once(&Once, +[] { reset(); });

  ThreadTrie *Item = nullptr;
  {
    SpinMutexLock Lock(&GlobalMutex);
    if (GlobalAllocators == nullptr)
    if (GlobalAllocators == nullptr || ThreadTries == nullptr)
      return;

    Item = ThreadTries->PushBack();
    Item = ThreadTries->Append({});
    Item->TId = TId;

    // Here we're using the internal allocator instead of the managed allocator
    // because:
    //
    // 1) We're not using the segmented array data structure to host
    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
    //    which works like a std::vector<...> keeping elements contiguous in
    //    memory. The segmented array data structure assumes that elements are
    //    trivially destructible, where FunctionCallTrie isn't.
    //
    // 2) Using a managed allocator means we need to manage that separately,
    //    which complicates the nature of this code. To get around that, we're
    //    using the internal allocator instead, which has its own global state
    //    and is decoupled from the lifetime management required by the managed
    //    allocator we have in XRay.
    //
    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
    DCHECK_NE(Item->Trie, nullptr);
    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
    auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
    new (Trie) FunctionCallTrie(*GlobalAllocators);
  }

  T.deepCopyInto(*Item->Trie);
  auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
  T.deepCopyInto(*Trie);
}

// A PathArray represents the function id's representing a stack trace. In this
@@ -127,18 +130,12 @@ struct ProfileRecord {

  // The Path in this record is the function id's from the leaf to the root of
  // the function call stack as represented from a FunctionCallTrie.
  PathArray *Path = nullptr;
  PathArray Path;
  const FunctionCallTrie::Node *Node = nullptr;

  // Constructor for in-place construction.
  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
      : Path([&] {
          auto P =
              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
          new (P) PathArray(A);
          return P;
        }()),
        Node(N) {}
      : Path(A), Node(N) {}
};

namespace {
@@ -167,8 +164,8 @@ static void populateRecords(ProfileRecordArray &PRs,
      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path->Append(N->FId);
      DCHECK(!Record->Path->empty());
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
@@ -183,7 +180,7 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // List of IDs follow:
    for (const auto FId : *Record.Path)
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);
@@ -213,16 +210,21 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
void serialize() {
  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers.
  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
    InternalFree((*ProfileBuffers)[I].Data);
  ProfileBuffers->Reset();
  if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
      ProfileBuffers == nullptr)
    return;

  if (ThreadTries->Size() == 0)
  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(B.Data, B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  if (ThreadTries->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
  u32 I = 0;
  for (const auto &ThreadTrie : *ThreadTries) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
@@ -233,9 +235,11 @@ void serialize() {
    // use a local allocator and an __xray::Array<...> to store the intermediary
    // data, then compute the size as we're going along. Then we'll allocate the
    // contiguous space to contain the thread buffer data.
    const auto &Trie = *(*ThreadTries)[I].Trie;
    const auto &Trie =
        *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
    if (Trie.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, Trie);
    DCHECK(!Trie.getRoots().empty());
    DCHECK(!ProfileRecords.empty());
@@ -251,68 +255,71 @@ void serialize() {
    // + end of record (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path->size());
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
    auto Buffer = ProfileBuffers->PushBack();
    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto Buffer = ProfileBuffers->Append({});
    Buffer->Size = sizeof(Header) + CumulativeSizes;
    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
    Buffer->Data = allocateBuffer(Buffer->Size);
    DCHECK_NE(Buffer->Data, nullptr);
    serializeRecords(Buffer, Header, ProfileRecords);

    // Now clean up the ProfileRecords array, one at a time.
    for (auto &Record : ProfileRecords) {
      Record.Path->~PathArray();
      InternalFree(Record.Path);
    }
  }
}

void reset() {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
      InternalFree((*ProfileBuffers)[I].Data);
    ProfileBuffers->Reset();
    InternalFree(ProfileBuffers);
    ProfileBuffers = nullptr;
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(B.Data, B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
  }

  if (ThreadTries != nullptr) {
    // Clear out the function call tries per thread.
    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
      auto &T = (*ThreadTries)[I];
      T.Trie->~FunctionCallTrie();
      InternalFree(T.Trie);
    for (auto &T : *ThreadTries) {
      auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
      Trie->~FunctionCallTrie();
    }
    ThreadTries->Reset();
    InternalFree(ThreadTries);
    ThreadTries = nullptr;
    ThreadTries->trim(ThreadTries->size());
  }

  // Reset the global allocators.
  if (GlobalAllocators != nullptr) {
  if (GlobalAllocators != nullptr)
    GlobalAllocators->~Allocators();
    InternalFree(GlobalAllocators);
    GlobalAllocators = nullptr;
  }
  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));

  GlobalAllocators =
      reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
  new (GlobalAllocators) FunctionCallTrie::Allocators();
  *GlobalAllocators = FunctionCallTrie::InitAllocators();
  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
      InternalAlloc(sizeof(Vector<ThreadTrie>)));
  new (ThreadTries) Vector<ThreadTrie>();
  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
  new (ProfileBuffers) Vector<ProfileBuffer>();

  if (ThreadTriesAllocator != nullptr)
    ThreadTriesAllocator->~ThreadTriesArrayAllocator();

  ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
      &ThreadTriesArrayAllocatorStorage);
  new (ThreadTriesAllocator)
      ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
  ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
  new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);

  if (ProfileBuffersAllocator != nullptr)
    ProfileBuffersAllocator->~ProfileBufferArrayAllocator();

  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);
  new (ProfileBuffersAllocator)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
  new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
}

XRayBuffer nextBuffer(XRayBuffer B) {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
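As a concrete check on the size computation in the serialize() hunk above: each record contributes 20 fixed bytes plus 4 bytes per function id on its path, and the block header adds another 16 bytes (the code initializes the header's first field with 16 + CumulativeSizes while Buffer->Size uses sizeof(Header) + CumulativeSizes, which implies sizeof(BlockHeader) is 16 here). A block holding a single record whose path has three function ids therefore occupies 16 + (20 + 4 * 3) = 48 bytes, and that is the value stored both in the header and in Buffer->Size.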
@@ -336,7 +343,7 @@ XRayBuffer nextBuffer(XRayBuffer B) {
  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->Size())
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
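The hunk below is from a second file, which from the trim()/Size members appears to be the segmented array (__xray::Array) that the collector now uses. The point of trim() is that it only lowers the logical size: the blocks backing the array stay allocated, so elements appended after a trim can reuse them instead of going back to the allocator. A toy, self-contained analogue of that behaviour (illustration only, not the __xray::Array implementation; element destructors are deliberately not run, mirroring the trivially-destructible assumption noted above):

    #include <cstddef>
    #include <vector>

    template <class T> class ChunkedArray {
      static constexpr std::size_t ChunkSize = 64;
      std::vector<std::vector<T>> Chunks; // backing blocks; kept across trim()
      std::size_t Size = 0;               // number of live elements

    public:
      void append(const T &V) {
        std::size_t Chunk = Size / ChunkSize;
        std::size_t Slot = Size % ChunkSize;
        if (Chunk == Chunks.size())
          Chunks.push_back(std::vector<T>(ChunkSize)); // new block only when needed
        Chunks[Chunk][Slot] = V;                       // otherwise reuse a slot
        ++Size;
      }

      // Drop the last N elements; every block stays allocated for future appends.
      void trim(std::size_t N) { Size -= (N < Size ? N : Size); }

      std::size_t size() const { return Size; }
      bool empty() const { return Size == 0; }
    };

This is why serialize() and reset() above call ProfileBuffers->trim(ProfileBuffers->size()) after releasing each buffer's payload through deallocateBuffer(): the array is emptied logically while its blocks remain ready for the next round of profiling.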
@@ -325,6 +325,9 @@ public:
  /// Remove N Elements from the end. This leaves the blocks behind, and does
  /// not require allocation of new blocks for new elements added after trimming.
  void trim(size_t Elements) {
    if (Elements == 0)
      return;

    DCHECK_LE(Elements, Size);
    DCHECK_GT(Size, 0);
    auto OldSize = Size;