[XRay][compiler-rt][NFC] Refactor global TLS variables behind an accessor function.

Summary:
This change hides all the initialization of thread_local variables used
by the XRay FDR mode implementation behind a function call. This makes
the initialization of thread-local data lazy, happening on first use,
instead of eager, as it is when the variables are declared as globals.
It also gives us an isolation mechanism if/when we want to change the
TLS implementation from the C++ thread_local keyword to something more
ad hoc (potentially using pthreads directly) on platforms or set-ups
where we cannot use C++ thread_local variables.
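
A minimal sketch of the pattern, with simplified member names (the real
ThreadLocalData in the FDR implementation carries more state, including
the buffer queue reference and the rewinding counters):

  #include <cstdint>

  struct ThreadLocalData {
    char *RecordPtr = nullptr;
    uint64_t LastTSC = 0;
  };

  // A function-local thread_local is constructed lazily, on each
  // thread's first call, instead of eagerly at thread start-up.
  static ThreadLocalData &getThreadLocalData() {
    thread_local ThreadLocalData TLD;
    return TLD;
  }

Since every access now goes through getThreadLocalData(), the accessor
body is the only code that would need to change if the storage were
later moved to, say, pthread_getspecific/pthread_setspecific.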

Reviewers: kpw, eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D37248

llvm-svn: 311997
Dean Michael Berris, 2017-08-29 12:21:45 +00:00
commit eca980396e (parent 9203afcf0d)
2 changed files with 132 additions and 103 deletions


@@ -223,7 +223,8 @@ void fdrLoggingHandleCustomEvent(void *Event,
     (void)Once;
   }
   int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
-  if (!isLogInitializedAndReady(*LocalBQ, TSC, CPU, clock_gettime))
+  auto &TLD = getThreadLocalData();
+  if (!isLogInitializedAndReady(TLD.LocalBQ, TSC, CPU, clock_gettime))
     return;

   // Here we need to prepare the log to handle:
@@ -231,7 +232,7 @@ void fdrLoggingHandleCustomEvent(void *Event,
   // - The additional data we're going to write. Currently, that's the size of
   // the event we're going to dump into the log as free-form bytes.
   if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) {
-    LocalBQ = nullptr;
+    TLD.LocalBQ = nullptr;
     return;
   }
@@ -246,9 +247,9 @@ void fdrLoggingHandleCustomEvent(void *Event,
   constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU));
   std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
   std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
-  std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent));
-  RecordPtr += sizeof(CustomEvent);
-  std::memcpy(RecordPtr, Event, ReducedEventSize);
+  std::memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent));
+  TLD.RecordPtr += sizeof(CustomEvent);
+  std::memcpy(TLD.RecordPtr, Event, ReducedEventSize);
   endBufferIfFull();
 }


@@ -104,36 +104,38 @@ static void processFunctionHook(int32_t FuncId, XRayEntryType Entry,
                                 __sanitizer::atomic_sint32_t &LoggingStatus,
                                 const std::shared_ptr<BufferQueue> &BQ);

-//-----------------------------------------------------------------------------|
-// The rest of the file is implementation.                                     |
-//-----------------------------------------------------------------------------|
-// Functions are implemented in the header for inlining since we don't want    |
-// to grow the stack when we've hijacked the binary for logging.               |
-//-----------------------------------------------------------------------------|
-
-namespace {
-
-thread_local BufferQueue::Buffer Buffer;
-thread_local char *RecordPtr = nullptr;
-
-// The number of FunctionEntry records immediately preceding RecordPtr.
-thread_local uint8_t NumConsecutiveFnEnters = 0;
-
-// The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit
-// records preceding RecordPtr.
-thread_local uint8_t NumTailCalls = 0;
-
-constexpr auto MetadataRecSize = sizeof(MetadataRecord);
-constexpr auto FunctionRecSize = sizeof(FunctionRecord);
-
-// We use a thread_local variable to keep track of which CPUs we've already
-// run, and the TSC times for these CPUs. This allows us to stop repeating the
-// CPU field in the function records.
-//
-// We assume that we'll support only 65536 CPUs for x86_64.
-thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
-thread_local uint64_t LastTSC = 0;
-thread_local uint64_t LastFunctionEntryTSC = 0;
+// Group together thread-local-data in a struct, then hide it behind a function
+// call so that it can be initialized on first use instead of as a global.
+struct ThreadLocalData {
+  BufferQueue::Buffer Buffer;
+  char *RecordPtr = nullptr;
+  // The number of FunctionEntry records immediately preceding RecordPtr.
+  uint8_t NumConsecutiveFnEnters = 0;
+  // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit
+  // records preceding RecordPtr.
+  uint8_t NumTailCalls = 0;
+
+  // We use a thread_local variable to keep track of which CPUs we've already
+  // run, and the TSC times for these CPUs. This allows us to stop repeating the
+  // CPU field in the function records.
+  //
+  // We assume that we'll support only 65536 CPUs for x86_64.
+  uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
+  uint64_t LastTSC = 0;
+  uint64_t LastFunctionEntryTSC = 0;
+
+  // Make sure a thread that's ever called handleArg0 has a thread-local
+  // live reference to the buffer queue for this particular instance of
+  // FDRLogging, and that we're going to clean it up when the thread exits.
+  std::shared_ptr<BufferQueue> LocalBQ = nullptr;
+};
+
+// Forward-declare, defined later.
+static ThreadLocalData &getThreadLocalData();
+
+static constexpr auto MetadataRecSize = sizeof(MetadataRecord);
+static constexpr auto FunctionRecSize = sizeof(FunctionRecord);

 class ThreadExitBufferCleanup {
   std::shared_ptr<BufferQueue> &Buffers;
@@ -146,6 +148,8 @@ public:
         Buffer(Buffer) {}

   ~ThreadExitBufferCleanup() noexcept XRAY_NEVER_INSTRUMENT {
+    auto &TLD = getThreadLocalData();
+    auto &RecordPtr = TLD.RecordPtr;
     if (RecordPtr == nullptr)
       return;
@@ -166,19 +170,27 @@ public:
   }
 };

-// Make sure a thread that's ever called handleArg0 has a thread-local
-// live reference to the buffer queue for this particular instance of
-// FDRLogging, and that we're going to clean it up when the thread exits.
-thread_local std::shared_ptr<BufferQueue>* LocalBQ =
-    new std::shared_ptr<BufferQueue>();
-thread_local ThreadExitBufferCleanup Cleanup(*LocalBQ, Buffer);
+static ThreadLocalData &getThreadLocalData() {
+  thread_local ThreadLocalData TLD;
+  thread_local ThreadExitBufferCleanup Cleanup(TLD.LocalBQ, TLD.Buffer);
+  return TLD;
+}
+
+//-----------------------------------------------------------------------------|
+// The rest of the file is implementation.                                     |
+//-----------------------------------------------------------------------------|
+// Functions are implemented in the header for inlining since we don't want    |
+// to grow the stack when we've hijacked the binary for logging.               |
+//-----------------------------------------------------------------------------|
+
+namespace {

 class RecursionGuard {
-  bool &Running;
+  volatile bool &Running;
   const bool Valid;

 public:
-  explicit RecursionGuard(bool &R) : Running(R), Valid(!R) {
+  explicit RecursionGuard(volatile bool &R) : Running(R), Valid(!R) {
     if (Valid)
       Running = true;
   }
@@ -238,24 +250,29 @@ inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
   }
   std::memcpy(MemPtr, Records, sizeof(MetadataRecord) * InitRecordsCount);
   MemPtr += sizeof(MetadataRecord) * InitRecordsCount;
-  NumConsecutiveFnEnters = 0;
-  NumTailCalls = 0;
+  auto &TLD = getThreadLocalData();
+  TLD.NumConsecutiveFnEnters = 0;
+  TLD.NumTailCalls = 0;
 }

 inline void setupNewBuffer(int (*wall_clock_reader)(
     clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
+  auto &Buffer = TLD.Buffer;
+  auto &RecordPtr = TLD.RecordPtr;
   RecordPtr = static_cast<char *>(Buffer.Buffer);
   pid_t Tid = syscall(SYS_gettid);
   timespec TS{0, 0};
   // This is typically clock_gettime, but callers have injection ability.
   wall_clock_reader(CLOCK_MONOTONIC, &TS);
   writeNewBufferPreamble(Tid, TS, RecordPtr);
-  NumConsecutiveFnEnters = 0;
-  NumTailCalls = 0;
+  TLD.NumConsecutiveFnEnters = 0;
+  TLD.NumTailCalls = 0;
 }

 inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
                                   char *&MemPtr) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
   MetadataRecord NewCPUId;
   NewCPUId.Type = uint8_t(RecordType::Metadata);
   NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
@@ -268,32 +285,34 @@ inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
   std::memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC));
   std::memcpy(MemPtr, &NewCPUId, sizeof(MetadataRecord));
   MemPtr += sizeof(MetadataRecord);
-  NumConsecutiveFnEnters = 0;
-  NumTailCalls = 0;
+  TLD.NumConsecutiveFnEnters = 0;
+  TLD.NumTailCalls = 0;
 }

 inline void writeNewCPUIdMetadata(uint16_t CPU,
                                   uint64_t TSC) XRAY_NEVER_INSTRUMENT {
-  writeNewCPUIdMetadata(CPU, TSC, RecordPtr);
+  writeNewCPUIdMetadata(CPU, TSC, getThreadLocalData().RecordPtr);
 }

 inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
   MetadataRecord EOBMeta;
   EOBMeta.Type = uint8_t(RecordType::Metadata);
   EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer);
   // For now we don't write any bytes into the Data field.
   std::memcpy(MemPtr, &EOBMeta, sizeof(MetadataRecord));
   MemPtr += sizeof(MetadataRecord);
-  NumConsecutiveFnEnters = 0;
-  NumTailCalls = 0;
+  TLD.NumConsecutiveFnEnters = 0;
+  TLD.NumTailCalls = 0;
 }

 inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT {
-  writeEOBMetadata(RecordPtr);
+  writeEOBMetadata(getThreadLocalData().RecordPtr);
 }

 inline void writeTSCWrapMetadata(uint64_t TSC,
                                  char *&MemPtr) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
   MetadataRecord TSCWrap;
   TSCWrap.Type = uint8_t(RecordType::Metadata);
   TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
@@ -304,12 +323,12 @@ inline void writeTSCWrapMetadata(uint64_t TSC,
   std::memcpy(&TSCWrap.Data, &TSC, sizeof(TSC));
   std::memcpy(MemPtr, &TSCWrap, sizeof(MetadataRecord));
   MemPtr += sizeof(MetadataRecord);
-  NumConsecutiveFnEnters = 0;
-  NumTailCalls = 0;
+  TLD.NumConsecutiveFnEnters = 0;
+  TLD.NumTailCalls = 0;
 }

 inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
-  writeTSCWrapMetadata(TSC, RecordPtr);
+  writeTSCWrapMetadata(TSC, getThreadLocalData().RecordPtr);
 }

 inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
@@ -324,36 +343,37 @@ inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
   FuncRecord.FuncId = FuncId & ~(0x0F << 28);
   FuncRecord.TSCDelta = TSCDelta;

+  auto &TLD = getThreadLocalData();
   switch (EntryType) {
   case XRayEntryType::ENTRY:
-    ++NumConsecutiveFnEnters;
+    ++TLD.NumConsecutiveFnEnters;
     FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
     break;
   case XRayEntryType::LOG_ARGS_ENTRY:
     // We should not rewind functions with logged args.
-    NumConsecutiveFnEnters = 0;
-    NumTailCalls = 0;
+    TLD.NumConsecutiveFnEnters = 0;
+    TLD.NumTailCalls = 0;
     FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
     break;
   case XRayEntryType::EXIT:
     // If we've decided to log the function exit, we will never erase the log
     // before it.
-    NumConsecutiveFnEnters = 0;
-    NumTailCalls = 0;
+    TLD.NumConsecutiveFnEnters = 0;
+    TLD.NumTailCalls = 0;
     FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit);
     break;
   case XRayEntryType::TAIL:
     // If we just entered the function we're tail exiting from or erased every
     // invocation since then, this function entry tail pair is a candidate to
     // be erased when the child function exits.
-    if (NumConsecutiveFnEnters > 0) {
-      ++NumTailCalls;
-      NumConsecutiveFnEnters = 0;
+    if (TLD.NumConsecutiveFnEnters > 0) {
+      ++TLD.NumTailCalls;
+      TLD.NumConsecutiveFnEnters = 0;
     } else {
       // We will never be able to erase this tail call since we have logged
       // something in between the function entry and tail exit.
-      NumTailCalls = 0;
-      NumConsecutiveFnEnters = 0;
+      TLD.NumTailCalls = 0;
+      TLD.NumConsecutiveFnEnters = 0;
     }
     FuncRecord.RecordKind =
         uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
@@ -391,20 +411,21 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
   using AlignedFuncStorage =
       std::aligned_storage<sizeof(FunctionRecord),
                           alignof(FunctionRecord)>::type;
-  RecordPtr -= FunctionRecSize;
+  auto &TLD = getThreadLocalData();
+  TLD.RecordPtr -= FunctionRecSize;
   AlignedFuncStorage AlignedFuncRecordBuffer;
   const auto &FuncRecord = *reinterpret_cast<FunctionRecord *>(
-      std::memcpy(&AlignedFuncRecordBuffer, RecordPtr, FunctionRecSize));
+      std::memcpy(&AlignedFuncRecordBuffer, TLD.RecordPtr, FunctionRecSize));
   assert(FuncRecord.RecordKind ==
              uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
          "Expected to find function entry recording when rewinding.");
   assert(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) &&
          "Expected matching function id when rewinding Exit");
-  --NumConsecutiveFnEnters;
+  --TLD.NumConsecutiveFnEnters;
   LastTSC -= FuncRecord.TSCDelta;

   // We unwound one call. Update the state and return without writing a log.
-  if (NumConsecutiveFnEnters != 0) {
+  if (TLD.NumConsecutiveFnEnters != 0) {
     LastFunctionEntryTSC -= FuncRecord.TSCDelta;
     return;
   }
@@ -414,8 +435,8 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
   // exited from via this exit.
   LastFunctionEntryTSC = 0;
   auto RewindingTSC = LastTSC;
-  auto RewindingRecordPtr = RecordPtr - FunctionRecSize;
-  while (NumTailCalls > 0) {
+  auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize;
+  while (TLD.NumTailCalls > 0) {
     AlignedFuncStorage TailExitRecordBuffer;
     // Rewind the TSC back over the TAIL EXIT record.
     const auto &ExpectedTailExit =
@@ -438,24 +459,25 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
     // This tail call exceeded the threshold duration. It will not be erased.
     if ((TSC - RewindingTSC) >= thresholdTicks()) {
-      NumTailCalls = 0;
+      TLD.NumTailCalls = 0;
       return;
     }

     // We can erase a tail exit pair that we're exiting through since
     // its duration is under threshold.
-    --NumTailCalls;
+    --TLD.NumTailCalls;
     RewindingRecordPtr -= FunctionRecSize;
     RewindingTSC -= ExpectedFunctionEntry.TSCDelta;
-    RecordPtr -= 2 * FunctionRecSize;
+    TLD.RecordPtr -= 2 * FunctionRecSize;
     LastTSC = RewindingTSC;
   }
 }

 inline bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
-  auto EC = BQArg.releaseBuffer(Buffer);
+  auto &TLD = getThreadLocalData();
+  auto EC = BQArg.releaseBuffer(TLD.Buffer);
   if (EC != BufferQueue::ErrorCode::Ok) {
-    Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer,
+    Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Buffer,
            BufferQueue::getErrorString(EC));
     return false;
   }
@@ -465,12 +487,14 @@ inline bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
 inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t,
                                                    struct timespec *),
                           size_t MaxSize) XRAY_NEVER_INSTRUMENT {
-  char *BufferStart = static_cast<char *>(Buffer.Buffer);
-  if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) {
+  auto &TLD = getThreadLocalData();
+  char *BufferStart = static_cast<char *>(TLD.Buffer.Buffer);
+  if ((TLD.RecordPtr + MaxSize) >
+      (BufferStart + TLD.Buffer.Size - MetadataRecSize)) {
     writeEOBMetadata();
-    if (!releaseThreadLocalBuffer(**LocalBQ))
+    if (!releaseThreadLocalBuffer(*TLD.LocalBQ))
       return false;
-    auto EC = (*LocalBQ)->getBuffer(Buffer);
+    auto EC = TLD.LocalBQ->getBuffer(TLD.Buffer);
     if (EC != BufferQueue::ErrorCode::Ok) {
       Report("Failed to acquire a buffer; error=%s\n",
              BufferQueue::getErrorString(EC));
@@ -489,14 +513,15 @@ inline bool isLogInitializedAndReady(
   // We should take the opportunity to release the buffer though.
   auto Status = __sanitizer::atomic_load(&LoggingStatus,
                                          __sanitizer::memory_order_acquire);
+  auto &TLD = getThreadLocalData();
   if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
-    if (RecordPtr != nullptr &&
+    if (TLD.RecordPtr != nullptr &&
         (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
          Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
       writeEOBMetadata();
       if (!releaseThreadLocalBuffer(*LBQ))
         return false;
-      RecordPtr = nullptr;
+      TLD.RecordPtr = nullptr;
       LBQ = nullptr;
       return false;
     }
@@ -507,11 +532,11 @@ inline bool isLogInitializedAndReady(
     writeEOBMetadata();
     if (!releaseThreadLocalBuffer(*LBQ))
       return false;
-    RecordPtr = nullptr;
+    TLD.RecordPtr = nullptr;
   }

-  if (Buffer.Buffer == nullptr) {
-    auto EC = LBQ->getBuffer(Buffer);
+  if (TLD.Buffer.Buffer == nullptr) {
+    auto EC = LBQ->getBuffer(TLD.Buffer);
     if (EC != BufferQueue::ErrorCode::Ok) {
       auto LS = __sanitizer::atomic_load(&LoggingStatus,
                                          __sanitizer::memory_order_acquire);
@@ -525,10 +550,10 @@ inline bool isLogInitializedAndReady(
     setupNewBuffer(wall_clock_reader);
   }

-  if (CurrentCPU == std::numeric_limits<uint16_t>::max()) {
+  if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) {
     // This means this is the first CPU this thread has ever run on. We set
     // the current CPU and record this as the first TSC we've seen.
-    CurrentCPU = CPU;
+    TLD.CurrentCPU = CPU;
     writeNewCPUIdMetadata(CPU, TSC);
   }
@@ -536,12 +561,13 @@
 } // namespace __xray_fdr_internal

 inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
-  auto BufferStart = static_cast<char *>(Buffer.Buffer);
-  if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
+  auto &TLD = getThreadLocalData();
+  auto BufferStart = static_cast<char *>(TLD.Buffer.Buffer);
+  if ((TLD.RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
     writeEOBMetadata();
-    if (!releaseThreadLocalBuffer(**LocalBQ))
+    if (!releaseThreadLocalBuffer(*TLD.LocalBQ))
       return;
-    RecordPtr = nullptr;
+    TLD.RecordPtr = nullptr;
   }
 }
@@ -555,19 +581,21 @@ inline void processFunctionHook(
   // don't want to be clobbering potentially partial writes already happening in
   // the thread. We use a simple thread_local latch to only allow one on-going
   // handleArg0 to happen at any given time.
-  thread_local bool Running = false;
+  thread_local volatile bool Running = false;
   RecursionGuard Guard{Running};
   if (!Guard) {
     assert(Running == true && "RecursionGuard is buggy!");
     return;
   }

+  auto &TLD = getThreadLocalData();
+
   // In case the reference has been cleaned up before, we make sure we
   // initialize it to the provided BufferQueue.
-  if ((*LocalBQ) == nullptr)
-    *LocalBQ = BQ;
+  if (TLD.LocalBQ == nullptr)
+    TLD.LocalBQ = BQ;

-  if (!isLogInitializedAndReady(*LocalBQ, TSC, CPU, wall_clock_reader))
+  if (!isLogInitializedAndReady(TLD.LocalBQ, TSC, CPU, wall_clock_reader))
     return;

   // Before we go setting up writing new function entries, we need to be really
@@ -607,14 +635,14 @@ inline void processFunctionHook(
   // Buffer, set it up properly before doing any further writing.
   //
   if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) {
-    *LocalBQ = nullptr;
+    TLD.LocalBQ = nullptr;
     return;
   }

   // By this point, we are now ready to write at most 24 bytes (one metadata
   // record and one function record).
-  assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) -
-             static_cast<char *>(Buffer.Buffer) >=
+  assert((TLD.RecordPtr + (MetadataRecSize + FunctionRecSize)) -
+             static_cast<char *>(TLD.Buffer.Buffer) >=
              static_cast<ptrdiff_t>(MetadataRecSize) &&
          "Misconfigured BufferQueue provided; Buffer size not large enough.");
@@ -638,36 +666,36 @@ inline void processFunctionHook(
   // the correct TSC delta.
   //
   uint32_t RecordTSCDelta = 0;
-  if (CPU != CurrentCPU) {
+  if (CPU != TLD.CurrentCPU) {
     // We've moved to a new CPU.
     writeNewCPUIdMetadata(CPU, TSC);
   } else {
     // If the delta is greater than the range for a uint32_t, then we write out
     // the TSC wrap metadata entry with the full TSC, and the TSC for the
     // function record be 0.
-    auto Delta = TSC - LastTSC;
+    auto Delta = TSC - TLD.LastTSC;
     if (Delta > (1ULL << 32) - 1)
       writeTSCWrapMetadata(TSC);
     else
       RecordTSCDelta = Delta;
   }

-  LastTSC = TSC;
-  CurrentCPU = CPU;
+  TLD.LastTSC = TSC;
+  TLD.CurrentCPU = CPU;
   switch (Entry) {
   case XRayEntryType::ENTRY:
   case XRayEntryType::LOG_ARGS_ENTRY:
     // Update the thread local state for the next invocation.
-    LastFunctionEntryTSC = TSC;
+    TLD.LastFunctionEntryTSC = TSC;
     break;
   case XRayEntryType::TAIL:
     break;
   case XRayEntryType::EXIT:
     // Break out and write the exit record if we can't erase any functions.
-    if (NumConsecutiveFnEnters == 0 ||
-        (TSC - LastFunctionEntryTSC) >= thresholdTicks())
+    if (TLD.NumConsecutiveFnEnters == 0 ||
+        (TSC - TLD.LastFunctionEntryTSC) >= thresholdTicks())
       break;
-    rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId);
+    rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId);
     return; // without writing log.
   case XRayEntryType::CUSTOM_EVENT: {
     // This is a bug in patching, so we'll report it once and move on.
@@ -682,7 +710,7 @@ inline void processFunctionHook(
   }
   }

-  writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr);
+  writeFunctionRecord(FuncId, RecordTSCDelta, Entry, TLD.RecordPtr);

   // If we've exhausted the buffer by this time, we then release the buffer to
   // make sure that other threads may start using this buffer.