diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt
index 795441122cb5..e368b75903e8 100644
--- a/compiler-rt/lib/xray/CMakeLists.txt
+++ b/compiler-rt/lib/xray/CMakeLists.txt
@@ -18,6 +18,8 @@ set(XRAY_BASIC_MODE_SOURCES
   xray_basic_flags.cc
   xray_basic_logging.cc)
 
+set(XRAY_PROFILER_MODE_SOURCES
+  xray_profiler_flags.cc)
 
 # Implementation files for all XRay architectures.
 set(x86_64_SOURCES
@@ -98,6 +100,12 @@ if (APPLE)
     SOURCES ${XRAY_BASIC_MODE_SOURCES}
     CFLAGS ${XRAY_CFLAGS}
     DEFS ${XRAY_COMMON_DEFINITIONS})
+  add_compiler_rt_object_libraries(RTXrayPROFILER
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    SOURCES ${XRAY_PROFILER_MODE_SOURCES}
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS})
 
   # We only support running on osx for now.
   add_compiler_rt_runtime(clang_rt.xray
@@ -132,6 +140,16 @@ if (APPLE)
     LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
     LINK_LIBS ${XRAY_LINK_LIBS}
     PARENT_TARGET xray)
+  add_compiler_rt_runtime(clang_rt.xray-profiler
+    STATIC
+    OS ${XRAY_SUPPORTED_OS}
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    OBJECT_LIBS RTXrayPROFILER
+    CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS}
+    LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+    LINK_LIBS ${XRAY_LINK_LIBS}
+    PARENT_TARGET xray)
 else() # not Apple
   foreach(arch ${XRAY_SUPPORTED_ARCH})
     if(NOT CAN_TARGET_${arch})
@@ -149,6 +167,10 @@ else() # not Apple
       ARCHS ${arch}
       SOURCES ${XRAY_BASIC_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS}
       DEFS ${XRAY_COMMON_DEFINITIONS})
+    add_compiler_rt_object_libraries(RTXrayPROFILER
+      ARCHS ${arch}
+      SOURCES ${XRAY_PROFILER_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS})
 
     # Common XRay archive for instrumented binaries.
     add_compiler_rt_runtime(clang_rt.xray
@@ -174,6 +196,13 @@ else() # not Apple
       DEFS ${XRAY_COMMON_DEFINITIONS}
       OBJECT_LIBS RTXrayBASIC
       PARENT_TARGET xray)
+    add_compiler_rt_runtime(clang_rt.xray-profiler
+      STATIC
+      ARCHS ${arch}
+      CFLAGS ${XRAY_CFLAGS}
+      DEFS ${XRAY_COMMON_DEFINITIONS}
+      OBJECT_LIBS RTXrayPROFILER
+      PARENT_TARGET xray)
   endforeach()
 endif() # not Apple
diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt
index f3cc85fc79f4..eeca10dc0998 100644
--- a/compiler-rt/lib/xray/tests/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/CMakeLists.txt
@@ -72,6 +72,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS)
     add_xray_lib("RTXRay.test.osx"
       $<TARGET_OBJECTS:RTXray.osx>
       $<TARGET_OBJECTS:RTXrayFDR.osx>
+      $<TARGET_OBJECTS:RTXrayPROFILER.osx>
       $<TARGET_OBJECTS:RTSanitizerCommon.osx>
       $<TARGET_OBJECTS:RTSanitizerCommonLibc.osx>)
   else()
@@ -79,6 +80,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS)
       add_xray_lib("RTXRay.test.${arch}"
        $<TARGET_OBJECTS:RTXray.${arch}>
        $<TARGET_OBJECTS:RTXrayFDR.${arch}>
+       $<TARGET_OBJECTS:RTXrayPROFILER.${arch}>
        $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
        $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>)
     endforeach()
diff --git a/compiler-rt/lib/xray/tests/unit/CMakeLists.txt b/compiler-rt/lib/xray/tests/unit/CMakeLists.txt
index 02fd6f1aee77..abe43202fe31 100644
--- a/compiler-rt/lib/xray/tests/unit/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/unit/CMakeLists.txt
@@ -6,3 +6,5 @@ add_xray_unittest(XRayAllocatorTest SOURCES
   allocator_test.cc xray_unit_test_main.cc)
 add_xray_unittest(XRaySegmentedArrayTest SOURCES
   segmented_array_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayFunctionCallTrieTest SOURCES
+  function_call_trie_test.cc xray_unit_test_main.cc)
diff --git a/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
new file mode 100644
index 000000000000..3aaa98804901
--- /dev/null
+++ b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
@@ -0,0 +1,253 @@
+//===-- function_call_trie_test.cc ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+
+#include "xray_function_call_trie.h"
+
+namespace __xray {
+
+namespace {
+
+TEST(FunctionCallTrieTest, Construction) {
+  // We want to make sure that we can create one of these without providing
+  // the set of allocators it needs; it will then default to the global
+  // allocators.
+  FunctionCallTrie Trie;
+}
+
+TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
+  // FIXME: Support passing in configuration for allocators in the allocator
+  // constructors.
+  profilerFlags()->setDefaults();
+  FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(Allocators);
+}
+
+TEST(FunctionCallTrieTest, EnterAndExitFunction) {
+  profilerFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  Trie.enterFunction(1, 1);
+  Trie.exitFunction(1, 2);
+
+  // We need a way to pull the data out. At this point, until we get a data
+  // collection service implemented, we're going to export the data as a list
+  // of roots, and manually walk through the structure ourselves.
+  const auto &R = Trie.getRoots();
+
+  ASSERT_EQ(R.size(), 1u);
+  ASSERT_EQ(R.front()->FId, 1);
+  ASSERT_EQ(R.front()->CallCount, 1);
+  ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionEntry) {
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  Trie.exitFunction(1, 1);
+  const auto &R = Trie.getRoots();
+
+  ASSERT_TRUE(R.empty());
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionExit) {
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+  Trie.enterFunction(1, 1);
+  const auto &R = Trie.getRoots();
+
+  ASSERT_TRUE(R.empty());
+}
+
+TEST(FunctionCallTrieTest, MultipleRoots) {
+  profilerFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  // Enter and exit FId = 1.
+  Trie.enterFunction(1, 1);
+  Trie.exitFunction(1, 2);
+
+  // Enter and exit FId = 2.
+  Trie.enterFunction(2, 3);
+  Trie.exitFunction(2, 4);
+
+  const auto &R = Trie.getRoots();
+  ASSERT_FALSE(R.empty());
+  ASSERT_EQ(R.size(), 2u);
+
+  // Make sure the roots are non-null and have different IDs.
+  const auto R0 = R[0];
+  const auto R1 = R[1];
+  ASSERT_NE(R0, nullptr);
+  ASSERT_NE(R1, nullptr);
+  ASSERT_NE(R0->FId, R1->FId);
+
+  // Inspect the roots to check that they have the right data.
+  EXPECT_EQ(R0->CallCount, 1u);
+  EXPECT_EQ(R0->CumulativeLocalTime, 1u);
+
+  EXPECT_EQ(R1->CallCount, 1u);
+  EXPECT_EQ(R1->CumulativeLocalTime, 1u);
+}
+
+// While missing an intermediary exit event may be rare in practice, we still
+// enforce that we can handle the case where we've missed an exit event for a
+// function somewhere in the middle of the stack. To illustrate, imagine the
+// following shadow call stack:
+//
+//   f0@t0 -> f1@t1 -> f2@t2
+//
+// If for whatever reason we see an exit for `f2` @ t3, followed by an exit
+// for `f0` @ t4 (i.e. no `f1` exit in between), then we account local time to
+// `f2` as d = (t3 - t2), local time to `f1` as d' = (t4 - t1), and local time
+// to `f0` as d'' = (t4 - t0) - d'.
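+//
+// To make the arithmetic concrete with the values used in the test below:
+// with entries f1@0, f2@100, f3@200 and exits f3@300, f1@400, we expect
+// d = 300 - 200 = 100 for f3, d' = 400 - 100 = 300 for f2, and
+// d'' = (400 - 0) - 300 = 100 for f1.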
+TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
+  profilerFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  Trie.enterFunction(1, 0);
+  Trie.enterFunction(2, 100);
+  Trie.enterFunction(3, 200);
+  Trie.exitFunction(3, 300);
+  Trie.exitFunction(1, 400);
+
+  // What we should see at this point is all the functions in the trie in a
+  // specific order (1 -> 2 -> 3) with the appropriate count(s) and local
+  // latencies.
+  const auto &R = Trie.getRoots();
+  ASSERT_FALSE(R.empty());
+  ASSERT_EQ(R.size(), 1u);
+
+  const auto &F1 = *R[0];
+  ASSERT_EQ(F1.FId, 1);
+  ASSERT_FALSE(F1.Callees.empty());
+
+  const auto &F2 = *F1.Callees[0].NodePtr;
+  ASSERT_EQ(F2.FId, 2);
+  ASSERT_FALSE(F2.Callees.empty());
+
+  const auto &F3 = *F2.Callees[0].NodePtr;
+  ASSERT_EQ(F3.FId, 3);
+  ASSERT_TRUE(F3.Callees.empty());
+
+  // Now that we've established the preconditions, we check for specific
+  // aspects of the nodes.
+  EXPECT_EQ(F3.CallCount, 1);
+  EXPECT_EQ(F2.CallCount, 1);
+  EXPECT_EQ(F1.CallCount, 1);
+  EXPECT_EQ(F3.CumulativeLocalTime, 100);
+  EXPECT_EQ(F2.CumulativeLocalTime, 300);
+  EXPECT_EQ(F1.CumulativeLocalTime, 100);
+}
+
+// TODO: Test that we can handle cross-CPU migrations, where TSCs are not
+// guaranteed to be synchronised.
+TEST(FunctionCallTrieTest, DeepCopy) {
+  profilerFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  Trie.enterFunction(1, 0);
+  Trie.enterFunction(2, 1);
+  Trie.exitFunction(2, 2);
+  Trie.enterFunction(3, 3);
+  Trie.exitFunction(3, 4);
+  Trie.exitFunction(1, 5);
+
+  // We want to make a deep copy and compare notes.
+  auto B = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Copy(B);
+  Trie.deepCopyInto(Copy);
+
+  ASSERT_NE(Trie.getRoots().size(), 0u);
+  ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size());
+  const auto &R0Orig = *Trie.getRoots()[0];
+  const auto &R0Copy = *Copy.getRoots()[0];
+  EXPECT_EQ(R0Orig.FId, 1);
+  EXPECT_EQ(R0Orig.FId, R0Copy.FId);
+
+  ASSERT_EQ(R0Orig.Callees.size(), 2u);
+  ASSERT_EQ(R0Copy.Callees.size(), 2u);
+
+  const auto &F1Orig =
+      *R0Orig.Callees
+           .find_element(
+               [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+           ->NodePtr;
+  const auto &F1Copy =
+      *R0Copy.Callees
+           .find_element(
+               [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+           ->NodePtr;
+  EXPECT_EQ(&R0Orig, F1Orig.Parent);
+  EXPECT_EQ(&R0Copy, F1Copy.Parent);
+}
+
+TEST(FunctionCallTrieTest, MergeInto) {
+  profilerFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie T0(A);
+  FunctionCallTrie T1(A);
+
+  // 1 -> 2 -> 3
+  T0.enterFunction(1, 0);
+  T0.enterFunction(2, 1);
+  T0.enterFunction(3, 2);
+  T0.exitFunction(3, 3);
+  T0.exitFunction(2, 4);
+  T0.exitFunction(1, 5);
+
+  // 1 -> 2 -> 3
+  T1.enterFunction(1, 0);
+  T1.enterFunction(2, 1);
+  T1.enterFunction(3, 2);
+  T1.exitFunction(3, 3);
+  T1.exitFunction(2, 4);
+  T1.exitFunction(1, 5);
+
+  // We use a different allocator here to make sure that we're able to transfer
+  // data into a FunctionCallTrie which uses a different allocator. This
+  // reflects the intended usage scenario for when we're collecting profiles
+  // that aggregate across threads.
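+  //
+  // Note that T0 and T1 deliberately share the allocator group A (emulating
+  // two per-thread tries drawing from the same budget), while the merge
+  // target gets its own group B. In either case the allocator group must
+  // outlive every trie constructed from it, since the trie only holds
+  // references into it.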
+  auto B = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Merged(B);
+
+  T0.mergeInto(Merged);
+  T1.mergeInto(Merged);
+
+  ASSERT_EQ(Merged.getRoots().size(), 1u);
+  const auto &R0 = *Merged.getRoots()[0];
+  EXPECT_EQ(R0.FId, 1);
+  EXPECT_EQ(R0.CallCount, 2);
+  EXPECT_EQ(R0.CumulativeLocalTime, 10);
+  EXPECT_EQ(R0.Callees.size(), 1u);
+
+  const auto &F1 = *R0.Callees[0].NodePtr;
+  EXPECT_EQ(F1.FId, 2);
+  EXPECT_EQ(F1.CallCount, 2);
+  EXPECT_EQ(F1.CumulativeLocalTime, 6);
+  EXPECT_EQ(F1.Callees.size(), 1u);
+
+  const auto &F2 = *F1.Callees[0].NodePtr;
+  EXPECT_EQ(F2.FId, 3);
+  EXPECT_EQ(F2.CallCount, 2);
+  EXPECT_EQ(F2.CumulativeLocalTime, 2);
+  EXPECT_EQ(F2.Callees.size(), 0u);
+}
+
+} // namespace
+
+} // namespace __xray
diff --git a/compiler-rt/lib/xray/xray_function_call_trie.h b/compiler-rt/lib/xray/xray_function_call_trie.h
new file mode 100644
index 000000000000..af262f847dfd
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_function_call_trie.h
@@ -0,0 +1,446 @@
+//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a function call trie.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FUNCTION_CALL_TRIE_H
+#define XRAY_FUNCTION_CALL_TRIE_H
+
+#include "xray_profiler_flags.h"
+#include "xray_segmented_array.h"
+#include <utility>
+
+namespace __xray {
+
+/// A FunctionCallTrie represents the stack traces of XRay instrumented
+/// functions that we've encountered, where a node corresponds to a function
+/// and the path from the root to the node represents its stack trace. Each
+/// node in the trie will contain some useful values, including:
+///
+///   * The cumulative amount of time spent in this particular node/stack.
+///   * The number of times this stack has appeared.
+///   * A histogram of latencies for that particular node.
+///
+/// Each node in the trie will also contain a list of callees, represented
+/// using an Array<NodeIdPair> -- each NodeIdPair instance will contain the
+/// function ID of the callee, and a pointer to the node.
+///
+/// If we visualise this data structure, we'll find the following potential
+/// representation:
+///
+///   [function id node] -> [callees] [cumulative time]
+///                         [call counter] [latency histogram]
+///
+/// As an example, when we have a function in this pseudocode:
+///
+///   func f(N) {
+///     g()
+///     h()
+///     for i := 1..N { j() }
+///   }
+///
+/// We may end up with a trie of the following form:
+///
+///   f -> [ g, h, j ] [...] [1] [...]
+///   g -> [ ... ] [...] [1] [...]
+///   h -> [ ... ] [...] [1] [...]
+///   j -> [ ... ] [...] [N] [...]
+///
+/// If for instance the function g() called j() like so:
+///
+///   func g() {
+///     for i := 1..10 { j() }
+///   }
+///
+/// We'll find the following updated trie:
+///
+///   f  -> [ g, h, j ] [...] [1] [...]
+///   g  -> [ j' ] [...] [1] [...]
+///   h  -> [ ... ] [...] [1] [...]
+///   j  -> [ ... ] [...] [N] [...]
+///   j' -> [ ... ] [...] [10] [...]
+///
+/// Note that we'll have a new node representing the path `f -> g -> j'` with
+/// isolated data. This isolation gives us a means of representing the stack
+/// traces as a path, as opposed to a key in a table.
+/// The alternative implementation here would be to use a separate table for
+/// the path, and use hashes of the path as an identifier to accumulate the
+/// information. We've moved away from this approach as it takes a lot of time
+/// to compute the hash every time we need to update a function's call
+/// information as we're handling the entry and exit events.
+///
+/// This approach allows us to maintain a shadow stack, which represents the
+/// currently executing path, and on function exits quickly compute the amount
+/// of time elapsed from the entry, then update the counters for the node
+/// already represented in the trie. This necessitates an efficient
+/// representation of the various data structures (the list of callees must be
+/// cache-aware and efficient to look up, and the histogram must be compact
+/// and quick to update) to enable us to keep the overheads of this
+/// implementation to the minimum.
+class FunctionCallTrie {
+public:
+  struct Node;
+
+  // We use a NodeIdPair type instead of a std::pair<...> so as not to rely on
+  // the standard library types in this header.
+  struct NodeIdPair {
+    Node *NodePtr;
+    int32_t FId;
+
+    // Constructor for inplace-construction.
+    NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
+  };
+
+  using NodeIdPairArray = Array<NodeIdPair>;
+  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
+
+  // A Node in the FunctionCallTrie gives us a list of callees, the cumulative
+  // number of times this node actually appeared, the cumulative amount of
+  // time for this particular node including its children call times, and
+  // just the local time spent on this node. Each Node will have the ID of
+  // the XRay instrumented function that it is associated with.
+  struct Node {
+    Node *Parent;
+    NodeIdPairArray Callees;
+    int64_t CallCount;
+    int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+    int32_t FId;
+
+    // We add a constructor here to allow us to inplace-construct through
+    // Array<...>'s AppendEmplace.
+    Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+         int32_t F)
+        : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
+          FId(F) {}
+
+    // TODO: Include the compact histogram.
+  };
+
+private:
+  struct ShadowStackEntry {
+    int32_t FId; // We're copying the function ID into the stack to avoid
+                 // having to reach into the node just to get the function ID.
+    uint64_t EntryTSC;
+    Node *NodePtr;
+
+    // We add a constructor here to allow us to inplace-construct through
+    // Array<...>'s AppendEmplace.
+    ShadowStackEntry(int32_t F, uint64_t T, Node *N)
+        : FId(F), EntryTSC(T), NodePtr(N) {}
+  };
+
+  using NodeArray = Array<Node>;
+  using RootArray = Array<Node *>;
+  using ShadowStackArray = Array<ShadowStackEntry>;
+
+public:
+  // We collate the allocators we need into a single struct, as a convenience
+  // to allow us to initialize these as a group.
+  struct Allocators {
+    using NodeAllocatorType = NodeArray::AllocatorType;
+    using RootAllocatorType = RootArray::AllocatorType;
+    using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;
+    using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
+
+    NodeAllocatorType *NodeAllocator = nullptr;
+    RootAllocatorType *RootAllocator = nullptr;
+    ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
+    NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+    Allocators() {}
+    Allocators(const Allocators &) = delete;
+    Allocators &operator=(const Allocators &) = delete;
+
+    Allocators(Allocators &&O)
+        : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
+          ShadowStackAllocator(O.ShadowStackAllocator),
+          NodeIdPairAllocator(O.NodeIdPairAllocator) {
+      O.NodeAllocator = nullptr;
+      O.RootAllocator = nullptr;
+      O.ShadowStackAllocator = nullptr;
+      O.NodeIdPairAllocator = nullptr;
+    }
+
+    Allocators &operator=(Allocators &&O) {
+      {
+        auto Tmp = O.NodeAllocator;
+        O.NodeAllocator = this->NodeAllocator;
+        this->NodeAllocator = Tmp;
+      }
+      {
+        auto Tmp = O.RootAllocator;
+        O.RootAllocator = this->RootAllocator;
+        this->RootAllocator = Tmp;
+      }
+      {
+        auto Tmp = O.ShadowStackAllocator;
+        O.ShadowStackAllocator = this->ShadowStackAllocator;
+        this->ShadowStackAllocator = Tmp;
+      }
+      {
+        auto Tmp = O.NodeIdPairAllocator;
+        O.NodeIdPairAllocator = this->NodeIdPairAllocator;
+        this->NodeIdPairAllocator = Tmp;
+      }
+      return *this;
+    }
+
+    ~Allocators() {
+      // Note that we cannot use delete on these pointers, as they need to be
+      // returned to the sanitizer_common library's internal memory tracking
+      // system.
+      if (NodeAllocator != nullptr) {
+        NodeAllocator->~NodeAllocatorType();
+        InternalFree(NodeAllocator);
+      }
+      if (RootAllocator != nullptr) {
+        RootAllocator->~RootAllocatorType();
+        InternalFree(RootAllocator);
+      }
+      if (ShadowStackAllocator != nullptr) {
+        ShadowStackAllocator->~ShadowStackAllocatorType();
+        InternalFree(ShadowStackAllocator);
+      }
+      if (NodeIdPairAllocator != nullptr) {
+        NodeIdPairAllocator->~NodeIdPairAllocatorType();
+        InternalFree(NodeIdPairAllocator);
+      }
+    }
+  };
+
+  // TODO: Support configuration of options through the arguments.
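+  //
+  // A typical lifecycle, as exercised by the unit tests (a sketch, not a
+  // contract of this interface):
+  //
+  //   profilerFlags()->setDefaults();
+  //   auto A = FunctionCallTrie::InitAllocators();
+  //   FunctionCallTrie Trie(A);
+  //   Trie.enterFunction(/*FId=*/1, /*TSC=*/100);
+  //   Trie.exitFunction(/*FId=*/1, /*TSC=*/200);
+  //   const auto &Roots = Trie.getRoots(); // Walk the trie from its roots.
+  //
+  // `A` must outlive `Trie`, since the trie holds references to the
+  // allocators.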
+  static Allocators InitAllocators() {
+    Allocators A;
+    auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
+        InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
+    new (NodeAllocator) Allocators::NodeAllocatorType(
+        profilerFlags()->per_thread_allocator_max, 0);
+    A.NodeAllocator = NodeAllocator;
+
+    auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
+        InternalAlloc(sizeof(Allocators::RootAllocatorType)));
+    new (RootAllocator) Allocators::RootAllocatorType(
+        profilerFlags()->per_thread_allocator_max, 0);
+    A.RootAllocator = RootAllocator;
+
+    auto ShadowStackAllocator =
+        reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
+            InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
+    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(
+        profilerFlags()->per_thread_allocator_max, 0);
+    A.ShadowStackAllocator = ShadowStackAllocator;
+
+    auto NodeIdPairAllocator =
+        reinterpret_cast<Allocators::NodeIdPairAllocatorType *>(
+            InternalAlloc(sizeof(Allocators::NodeIdPairAllocatorType)));
+    new (NodeIdPairAllocator) Allocators::NodeIdPairAllocatorType(
+        profilerFlags()->per_thread_allocator_max, 0);
+    A.NodeIdPairAllocator = NodeIdPairAllocator;
+    return A;
+  }
+
+private:
+  NodeArray Nodes;
+  RootArray Roots;
+  ShadowStackArray ShadowStack;
+  NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+  const Allocators &GetGlobalAllocators() {
+    static const Allocators A = [] { return InitAllocators(); }();
+    return A;
+  }
+
+public:
+  explicit FunctionCallTrie(const Allocators &A)
+      : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+        ShadowStack(*A.ShadowStackAllocator),
+        NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+
+  FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {}
+
+  void enterFunction(int32_t FId, uint64_t TSC) {
+    // This function primarily deals with ensuring that the ShadowStack is
+    // consistent and ready for when an exit event is encountered.
+    if (UNLIKELY(ShadowStack.empty())) {
+      auto NewRoot =
+          Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+      if (UNLIKELY(NewRoot == nullptr))
+        return;
+      Roots.Append(NewRoot);
+      ShadowStack.AppendEmplace(FId, TSC, NewRoot);
+      return;
+    }
+
+    auto &Top = ShadowStack.back();
+    auto TopNode = Top.NodePtr;
+
+    // If we've seen this callee before, then we just access that node and
+    // place it on the top of the stack.
+    auto Callee = TopNode->Callees.find_element(
+        [FId](const NodeIdPair &NR) { return NR.FId == FId; });
+    if (Callee != nullptr) {
+      CHECK_NE(Callee->NodePtr, nullptr);
+      ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr);
+      return;
+    }
+
+    // This means we've never seen this stack before; create a new node here.
+    auto NewNode =
+        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+    if (UNLIKELY(NewNode == nullptr))
+      return;
+    TopNode->Callees.AppendEmplace(NewNode, FId);
+    ShadowStack.AppendEmplace(FId, TSC, NewNode);
+    return;
+  }
+
+  void exitFunction(int32_t FId, uint64_t TSC) {
+    // When we exit a function, we look up the ShadowStack to see whether
+    // we've entered this function before. We do as little processing here as
+    // we can, since most of the hard work would have already been done at
+    // function entry.
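+    //
+    // The loop below unwinds the shadow stack until it pops the frame
+    // matching FId, which also covers missed exit events: each popped frame
+    // is credited its elapsed time (TSC - EntryTSC) minus CumulativeTreeTime,
+    // the sum of the elapsed times of the frames already popped during this
+    // same exit event.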
+    if (UNLIKELY(ShadowStack.empty()))
+      return;
+
+    uint64_t CumulativeTreeTime = 0;
+    while (!ShadowStack.empty()) {
+      auto &Top = ShadowStack.back();
+      auto TopNode = Top.NodePtr;
+      auto TopFId = TopNode->FId;
+      auto LocalTime = TSC - Top.EntryTSC;
+      TopNode->CallCount++;
+      TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
+      CumulativeTreeTime += LocalTime;
+      ShadowStack.trim(1);
+
+      // TODO: Update the histogram for the node.
+      if (TopFId == FId)
+        break;
+    }
+  }
+
+  const RootArray &getRoots() const { return Roots; }
+
+  // The deepCopyInto operation will update the provided FunctionCallTrie by
+  // re-creating the contents of this particular FunctionCallTrie in the other
+  // FunctionCallTrie. It will do this using a Depth First Traversal from the
+  // roots, and while doing so recreating the traversal in the provided
+  // FunctionCallTrie.
+  //
+  // This operation will *not* destroy the state in `O`, and thus may cause
+  // some duplicate entries in `O` if it is not empty.
+  //
+  // This function is *not* thread-safe, and may require external
+  // synchronisation of both "this" and |O|.
+  //
+  // This function must *not* be called with a non-empty FunctionCallTrie |O|.
+  void deepCopyInto(FunctionCallTrie &O) const {
+    DCHECK(O.getRoots().empty());
+    for (const auto Root : getRoots()) {
+      // Add a node in O for this root.
+      auto NewRoot = O.Nodes.AppendEmplace(
+          nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+          Root->CumulativeLocalTime, Root->FId);
+      O.Roots.Append(NewRoot);
+
+      // We then push the root into a stack, to use as the parent marker for
+      // new nodes we push in as we're traversing depth-first down the call
+      // tree.
+      struct NodeAndParent {
+        FunctionCallTrie::Node *Node;
+        FunctionCallTrie::Node *NewNode;
+      };
+      using Stack = Array<NodeAndParent>;
+
+      typename Stack::AllocatorType StackAllocator(
+          profilerFlags()->stack_allocator_max, 0);
+      Stack DFSStack(StackAllocator);
+
+      // TODO: Figure out what to do if we fail to allocate any more stack
+      // space. Maybe warn or report once?
+      DFSStack.Append(NodeAndParent{Root, NewRoot});
+      while (!DFSStack.empty()) {
+        NodeAndParent NP = DFSStack.back();
+        DCHECK_NE(NP.Node, nullptr);
+        DCHECK_NE(NP.NewNode, nullptr);
+        DFSStack.trim(1);
+        for (const auto Callee : NP.Node->Callees) {
+          auto NewNode = O.Nodes.AppendEmplace(
+              NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
+              Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+          DCHECK_NE(NewNode, nullptr);
+          NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
+          DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode});
+        }
+      }
+    }
+  }
+
+  // The mergeInto operation will update the provided FunctionCallTrie by
+  // traversing the current trie's roots and updating (i.e. merging) the data
+  // in the nodes with the data in the target's nodes. If the node doesn't
+  // exist in the provided trie, we add a new one in the right position, and
+  // inherit the data from the original (current) trie, along with all its
+  // callees.
+  //
+  // This function is *not* thread-safe, and may require external
+  // synchronisation of both "this" and |O|.
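+  //
+  // A sketch of the intended aggregation flow, mirroring the MergeInto unit
+  // test: given per-thread tries T0 and T1, a collector might do:
+  //
+  //   auto B = FunctionCallTrie::InitAllocators();
+  //   FunctionCallTrie Merged(B);
+  //   T0.mergeInto(Merged);
+  //   T1.mergeInto(Merged);
+  //
+  // Unlike deepCopyInto, mergeInto may be called with a non-empty target;
+  // nodes already present in |O| have their counts and times accumulated.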
+  void mergeInto(FunctionCallTrie &O) const {
+    struct NodeAndTarget {
+      FunctionCallTrie::Node *OrigNode;
+      FunctionCallTrie::Node *TargetNode;
+    };
+    using Stack = Array<NodeAndTarget>;
+    typename Stack::AllocatorType StackAllocator(
+        profilerFlags()->stack_allocator_max, 0);
+    Stack DFSStack(StackAllocator);
+
+    for (const auto Root : getRoots()) {
+      Node *TargetRoot = nullptr;
+      auto R = O.Roots.find_element(
+          [&](const Node *Node) { return Node->FId == Root->FId; });
+      if (R == nullptr) {
+        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
+                                           0, Root->FId);
+        O.Roots.Append(TargetRoot);
+      } else {
+        TargetRoot = *R;
+      }
+
+      DFSStack.Append(NodeAndTarget{Root, TargetRoot});
+      while (!DFSStack.empty()) {
+        NodeAndTarget NT = DFSStack.back();
+        DCHECK_NE(NT.OrigNode, nullptr);
+        DCHECK_NE(NT.TargetNode, nullptr);
+        DFSStack.trim(1);
+        // TODO: Update the histogram as well when we have it ready.
+        NT.TargetNode->CallCount += NT.OrigNode->CallCount;
+        NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
+        for (const auto Callee : NT.OrigNode->Callees) {
+          auto TargetCallee = NT.TargetNode->Callees.find_element(
+              [&](const FunctionCallTrie::NodeIdPair &C) {
+                return C.FId == Callee.FId;
+              });
+          if (TargetCallee == nullptr) {
+            auto NewTargetNode = O.Nodes.AppendEmplace(
+                NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+            TargetCallee =
+                NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
+          }
+          DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr});
+        }
+      }
+    }
+  }
+};
+
+} // namespace __xray
+
+#endif // XRAY_FUNCTION_CALL_TRIE_H
diff --git a/compiler-rt/lib/xray/xray_profiler_flags.cc b/compiler-rt/lib/xray/xray_profiler_flags.cc
new file mode 100644
index 000000000000..7dc20cedf6fb
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_profiler_flags.cc
@@ -0,0 +1,40 @@
+//===-- xray_profiler_flags.cc ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiler runtime flags.
+//===----------------------------------------------------------------------===//
+
+#include "xray_profiler_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+namespace __xray {
+
+// Storage for the profiler flags.
+ProfilerFlags xray_profiler_flags_dont_use_directly;
+
+void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_profiler_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerProfilerFlags(FlagParser *P,
+                           ProfilerFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                      \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_profiler_flags.inc"
+#undef XRAY_FLAG
+}
+
+} // namespace __xray
diff --git a/compiler-rt/lib/xray/xray_profiler_flags.h b/compiler-rt/lib/xray/xray_profiler_flags.h
new file mode 100644
index 000000000000..9a57abb26ab0
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_profiler_flags.h
@@ -0,0 +1,39 @@
+//===-- xray_profiler_flags.h ----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiler runtime flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef XRAY_PROFILER_FLAGS_H
+#define XRAY_PROFILER_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+struct ProfilerFlags {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_profiler_flags.inc"
+#undef XRAY_FLAG
+
+  void setDefaults();
+};
+
+extern ProfilerFlags xray_profiler_flags_dont_use_directly;
+inline ProfilerFlags *profilerFlags() {
+  return &xray_profiler_flags_dont_use_directly;
+}
+void registerProfilerFlags(FlagParser *P, ProfilerFlags *F);
+
+} // namespace __xray
+
+#endif // XRAY_PROFILER_FLAGS_H
diff --git a/compiler-rt/lib/xray/xray_profiler_flags.inc b/compiler-rt/lib/xray/xray_profiler_flags.inc
new file mode 100644
index 000000000000..9266598a19ca
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_profiler_flags.inc
@@ -0,0 +1,26 @@
+//===-- xray_profiler_flags.inc ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay profiling runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
+          "Maximum size of any single per-thread allocator.")
+XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
+          "Maximum size of the global allocator for profile storage.")
+XRAY_FLAG(uptr, stack_allocator_max, 2 << 24,
+          "Maximum size of the traversal stack allocator.")
+XRAY_FLAG(int, grace_period_ms, 100,
+          "Profile collection will wait this long (in milliseconds) before "
+          "resetting the global state. This gives threads a chance to notice "
+          "that the profiler has been finalized, and to clean up.")
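
For context, here is a minimal sketch of how a runtime component might wire these flags up with sanitizer_common's FlagParser. The `initProfilerFlags()` function and the `XRAY_PROFILER_OPTIONS` environment variable are illustrative assumptions, not part of this patch; the patch itself only provides the flag storage, `setDefaults()`, and `registerProfilerFlags()`.

// Illustrative sketch only; not introduced by this patch.
#include "sanitizer_common/sanitizer_common.h"      // GetEnv.
#include "sanitizer_common/sanitizer_flag_parser.h" // FlagParser.
#include "xray_profiler_flags.h"

namespace __xray {

static void initProfilerFlags() {
  // Start from the defaults defined in xray_profiler_flags.inc.
  ProfilerFlags *F = profilerFlags();
  F->setDefaults();

  // Register every flag with a parser, then let the environment override
  // individual values, e.g.
  //   XRAY_PROFILER_OPTIONS="per_thread_allocator_max=4194304"
  FlagParser Parser;
  registerProfilerFlags(&Parser, F);
  if (const char *Env = GetEnv("XRAY_PROFILER_OPTIONS"))
    Parser.ParseString(Env);
}

} // namespace __xray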