// llvm-project/llvm/lib/Support/PrettyStackTrace.cpp

//===- PrettyStackTrace.cpp - Pretty Crash Handling -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines some helpful functions for dealing with the possibility of
// Unix signals occurring while your program is running.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm-c/ErrorHandling.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/Watchdog.h"
#include "llvm/Support/raw_ostream.h"
#include <atomic>
#include <cstdarg>
#include <cstdio>
#include <tuple>
#ifdef HAVE_CRASHREPORTERCLIENT_H
#include <CrashReporterClient.h>
#endif
using namespace llvm;
// If backtrace support is not enabled, compile out support for pretty stack
// traces. This has the secondary effect of not requiring thread local storage
// when backtrace support is disabled.
#if ENABLE_BACKTRACES
[LPM] Rip all of ManagedStatic and ThreadLocal out of the pretty stack tracing code. Managed static was just insane overhead for this. We took memory fences and external function calls in every path that pushed a pretty stack frame. This includes a multitude of layers setting up and tearing down passes, the parser in Clang, everywhere. For the regression test suite or low-overhead JITs, this was contributing to really significant overhead. Even the LLVM ThreadLocal is really overkill here because it uses pthread_{set,get}_specific logic, and has careful code to both allocate and delete the thread local data. We don't actually want any of that, and this code in particular has problems coping with deallocation. What we want is a single TLS pointer that is valid to use during global construction and during global destruction, any time we want. That is exactly what every host compiler and OS we use has implemented for a long time, and what was standardized in C++11. Even though not all of our host compilers support the thread_local keyword, we can directly use the platform-specific keywords to get the minimal functionality needed. Provided this limited trial survives the build bots, I will move this to Compiler.h so it is more widely available as a light weight if limited alternative to the ThreadLocal class. Many thanks to David Majnemer for helping me think through the implications across platforms and craft the MSVC-compatible syntax. The end result is *substantially* faster. When running llc in a tight loop over a small IR file targeting the aarch64 backend, this improves its performance by over 10% for me. It also seems likely to fix the remaining regressions seen by JIT users with threading enabled. This may actually have more impact on real-world compile times due to the use of the pretty stack tracing utility throughout the rest of Clang or LLVM, but I've not collected any detailed measurements. llvm-svn: 227300
2015-01-28 17:52:14 +08:00
// We need a thread local pointer to manage the stack of our stack trace
// objects, but we *really* cannot tolerate destructors running and do not want
// to pay any overhead of synchronizing. As a consequence, we use a raw
// thread-local variable.
static LLVM_THREAD_LOCAL PrettyStackTraceEntry *PrettyStackTraceHead = nullptr;
// The use of 'volatile' here is to ensure that any particular thread always
// reloads the value of the counter. The 'std::atomic' allows us to specify that
// this variable is accessed in an unsynchronized way (it's not actually
// synchronizing). This does technically mean that the value may not appear to
// be the same across threads running simultaneously on different CPUs, but in
// practice the worst that will happen is that we won't print a stack trace when
// we could have.
//
// This is initialized to 1 because 0 is used as a sentinel for "not enabled on
// the current thread". If the user happens to overflow an 'unsigned' with
// SIGINFO requests, it's possible that some threads will stop responding to it,
// but the program won't crash.
//
// Brace initialization is used instead of ATOMIC_VAR_INIT, which is deprecated
// since C++20; both forms constant-initialize the atomic to 1.
static volatile std::atomic<unsigned> GlobalSigInfoGenerationCounter{1};
// Per-thread record of the SIGINFO generation this thread last handled. The
// value 0 means SIGINFO-triggered printing is not enabled on this thread; any
// other value is compared against GlobalSigInfoGenerationCounter to decide
// whether a new request is pending.
static LLVM_THREAD_LOCAL unsigned ThreadLocalSigInfoGenerationCounter = 0;
namespace llvm {
/// Reverse, in place, the singly linked list of entries that starts at
/// \p Head by re-pointing each entry's NextEntry at its predecessor.
///
/// \returns the first entry of the reversed list (the last entry of the
/// input), or nullptr when \p Head is nullptr.
PrettyStackTraceEntry *ReverseStackTrace(PrettyStackTraceEntry *Head) {
  PrettyStackTraceEntry *Reversed = nullptr;
  for (PrettyStackTraceEntry *Cur = Head; Cur;) {
    // Remember the unprocessed tail before overwriting the link.
    PrettyStackTraceEntry *Rest = Cur->NextEntry;
    Cur->NextEntry = Reversed;
    Reversed = Cur;
    Cur = Rest;
  }
  return Reversed;
}
} // namespace llvm
/// Write every entry of the current thread's pretty stack to \p OS, one
/// numbered line per entry, starting with the entry that was pushed first.
///
/// Recursion is avoided on purpose (it is likely to fail if we crashed due to
/// stack overflow): the list is reversed up front, walked, and then reversed
/// back into its original order.
static void PrintStack(raw_ostream &OS) {
  // While printing, the thread's head pointer is set to nullptr; the
  // SaveAndRestore object puts the saved head back when this scope exits.
  SaveAndRestore<PrettyStackTraceEntry *> DetachedHead{PrettyStackTraceHead,
                                                       nullptr};
  PrettyStackTraceEntry *const OldestFirst =
      llvm::ReverseStackTrace(DetachedHead.get());

  unsigned Index = 0;
  for (const PrettyStackTraceEntry *Cur = OldestFirst; Cur;
       Cur = Cur->getNextEntry()) {
    OS << Index << ".\t";
    ++Index;
    // A fresh watchdog (argument 5) is alive only while this entry prints.
    sys::Watchdog Timer(5);
    Cur->print(OS);
  }

  // Undo the reversal so the list is back in its original order.
  llvm::ReverseStackTrace(OldestFirst);
}
/// Print the current stack trace to the specified stream.
///
/// Emits nothing at all when the current thread has no registered entries;
/// otherwise writes a "Stack dump:" header followed by the entries, and
/// flushes \p OS.
///
/// Marked NOINLINE so it can be called from debuggers.
LLVM_ATTRIBUTE_NOINLINE
static void PrintCurStackTrace(raw_ostream &OS) {
  // Don't print an empty trace.
  if (!PrettyStackTraceHead)
    return;

  // If there are pretty stack frames registered, walk and emit them.
  OS << "Stack dump:\n";
  PrintStack(OS);
  OS.flush();
}
// Integrate with crash reporter libraries.
//
// Exactly one of two mechanisms is compiled in on Apple platforms, depending
// on what the build configuration detected: the CrashReporterClient
// annotations struct, or the legacy __crashreporter_info__ string pointer.
#if defined (__APPLE__) && defined(HAVE_CRASHREPORTERCLIENT_H)
// If any clients of llvm try to link to libCrashReporterClient.a themselves,
// only one crash info struct will be used.
extern "C" {
CRASH_REPORTER_CLIENT_HIDDEN
struct crashreporter_annotations_t gCRAnnotations
        __attribute__((section("__DATA," CRASHREPORTER_ANNOTATIONS_SECTION)))
// The initializer has one more field from annotations version 5 onwards.
#if CRASHREPORTER_ANNOTATIONS_VERSION < 5
        = { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0, 0, 0 };
#else
        = { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0, 0, 0, 0 };
#endif
}
#elif defined(__APPLE__) && HAVE_CRASHREPORTER_INFO
// Hidden-visibility message pointer, written by setCrashLogMessage().
extern "C" const char *__crashreporter_info__
    __attribute__((visibility("hidden"))) = nullptr;
asm(".desc ___crashreporter_info__, 0x10");
#endif
/// Hand \p msg to whichever crash-reporting mechanism was selected at build
/// time: CRSetCrashLogMessage() when HAVE_CRASHREPORTERCLIENT_H is defined,
/// otherwise the __crashreporter_info__ pointer when HAVE_CRASHREPORTER_INFO
/// is set. When neither is available only the signal fence below executes.
///
/// The pointer itself is stored, not a copy of the text, on the
/// __crashreporter_info__ path.
///
/// Declared separately so LLVM_ATTRIBUTE_UNUSED can be attached: the function
/// is only called from the Apple branch of CrashHandler.
static void setCrashLogMessage(const char *msg) LLVM_ATTRIBUTE_UNUSED;
static void setCrashLogMessage(const char *msg) {
#ifdef HAVE_CRASHREPORTERCLIENT_H
(void)CRSetCrashLogMessage(msg);
#elif HAVE_CRASHREPORTER_INFO
__crashreporter_info__ = msg;
#endif
// Don't reorder subsequent operations: whatever comes after might crash and
// we want the system crash handling to see the message we just set.
std::atomic_signal_fence(std::memory_order_seq_cst);
}
#ifdef __APPLE__
// Statically reserved, suitably sized and aligned raw storage for the string
// that CrashHandler builds. CrashHandler placement-constructs a
// CrashHandlerString into it, so no SmallString constructor runs at static
// initialization time.
using CrashHandlerString = SmallString<2048>;
using CrashHandlerStringStorage =
std::aligned_storage<sizeof(CrashHandlerString),
alignof(CrashHandlerString)>::type;
static CrashHandlerStringStorage crashHandlerStringStorage;
#endif
/// This callback is run if a fatal signal is delivered to the process, it
/// prints the pretty stack trace.
///
/// The unnamed void* parameter is ignored; RegisterCrashPrinter() passes
/// nullptr for it when installing this handler.
static void CrashHandler(void *) {
#ifndef __APPLE__
// On non-apple systems, just emit the crash stack trace to stderr.
PrintCurStackTrace(errs());
#else
// Emit the crash stack trace to a SmallString, put it where the system crash
// handling will find it, and also send it to stderr.
//
// The SmallString is fairly large in the hope that we don't allocate (we're
// handling a fatal signal, something is already pretty wrong, allocation
// might not work). Further, we don't use a magic static in case that's also
// borked. We leak any allocation that does occur because the program is about
// to die anyways. This is technically racy if we were handling two fatal
// signals, however if we're in that situation a race is the least of our
// worries.
auto &crashHandlerString =
*new (&crashHandlerStringStorage) CrashHandlerString;
// If we crash while trying to print the stack trace, we still want the system
// crash handling to have some partial information. That'll work out as long
// as the SmallString doesn't allocate. If it does allocate then the system
// crash handling will see some garbage because the inline buffer now contains
// a pointer.
setCrashLogMessage(crashHandlerString.c_str());
{
// Scoped so the stream is destroyed before the string is inspected below.
raw_svector_ostream Stream(crashHandlerString);
PrintCurStackTrace(Stream);
}
if (!crashHandlerString.empty()) {
// Publish the message again now that the trace is complete, then mirror
// it to stderr.
setCrashLogMessage(crashHandlerString.c_str());
errs() << crashHandlerString.str();
} else
// Nothing was printed (no entries registered on this thread).
setCrashLogMessage("No crash information.");
#endif
}
/// Print the current thread's pretty stack to errs() if a SIGINFO request is
/// pending for it.
///
/// A request is pending when this thread has opted in (its thread-local
/// generation is non-zero) and that generation differs from the global one.
/// After printing, the thread-local generation is updated to the value that
/// was read from the global counter.
static void printForSigInfoIfNeeded() {
  const unsigned LatestGeneration =
      GlobalSigInfoGenerationCounter.load(std::memory_order_relaxed);

  // 0 is the "not enabled on this thread" sentinel.
  const bool DisabledOnThisThread = ThreadLocalSigInfoGenerationCounter == 0;
  const bool AlreadyHandled =
      ThreadLocalSigInfoGenerationCounter == LatestGeneration;
  if (DisabledOnThisThread || AlreadyHandled)
    return;

  PrintCurStackTrace(errs());
  ThreadLocalSigInfoGenerationCounter = LatestGeneration;
}
#endif // ENABLE_BACKTRACES
/// Push this entry onto the current thread's pretty stack.
///
/// When backtraces are compiled out (ENABLE_BACKTRACES is false) this does
/// nothing.
PrettyStackTraceEntry::PrettyStackTraceEntry() {
#if ENABLE_BACKTRACES
  // Handle SIGINFO first, because we haven't finished constructing yet.
  printForSigInfoIfNeeded();
  // Link ourselves.
  NextEntry = PrettyStackTraceHead;
  PrettyStackTraceHead = this;
#endif
}
/// Pop this entry from the current thread's pretty stack.
///
/// Entries must be destroyed in the reverse of the order in which they were
/// created; this is asserted. When backtraces are compiled out
/// (ENABLE_BACKTRACES is false) this does nothing.
PrettyStackTraceEntry::~PrettyStackTraceEntry() {
#if ENABLE_BACKTRACES
  assert(PrettyStackTraceHead == this &&
         "Pretty stack trace entry destruction is out of order");
  PrettyStackTraceHead = NextEntry;
  // Handle SIGINFO only after unlinking, because we already started
  // destructing: any trace printed here no longer includes this entry.
  printForSigInfoIfNeeded();
#endif
}
/// Write the stored string to \p OS, followed by a newline.
void PrettyStackTraceString::print(raw_ostream &OS) const {
  OS << Str;
  OS << "\n";
}
/// Format the printf-style \p Format and its variadic arguments into Str.
///
/// The arguments are traversed twice: once to measure the output and once to
/// write it. If the measuring call reports an error (negative result), Str is
/// left untouched.
PrettyStackTraceFormat::PrettyStackTraceFormat(const char *Format, ...) {
  va_list Args;

  // Pass 1: ask vsnprintf how many characters the formatted text needs.
  va_start(Args, Format);
  const int Length = vsnprintf(nullptr, 0, Format, Args);
  va_end(Args);

  if (Length >= 0) {
    // Pass 2: size the buffer (text plus '\0') and format into it.
    const int BufferSize = Length + 1;
    Str.resize(BufferSize);
    va_start(Args, Format);
    vsnprintf(Str.data(), BufferSize, Format, Args);
    va_end(Args);
  }
}
/// Write the formatted message held in Str to \p OS, followed by a newline.
void PrettyStackTraceFormat::print(raw_ostream &OS) const {
  OS << Str;
  OS << "\n";
}
/// Write "Program arguments: " followed by each of the ArgC entries of ArgV,
/// every one followed by a single space, and a terminating newline.
void PrettyStackTraceProgram::print(raw_ostream &OS) const {
  OS << "Program arguments: ";

  // Print the argument list.
  const unsigned Count = ArgC;
  unsigned Idx = 0;
  while (Idx != Count) {
    OS << ArgV[Idx] << ' ';
    ++Idx;
  }

  OS << '\n';
}
#if ENABLE_BACKTRACES
/// Install CrashHandler as a signal handler, passing nullptr as its argument.
///
/// \returns false, always. The value only exists so the call can be used to
/// initialize a function-local static in EnablePrettyStackTrace().
static bool RegisterCrashPrinter() {
  sys::AddSignalHandler(&CrashHandler, nullptr);
  return false;
}
#endif
/// Register the crash printer. Only the first call has any effect; a no-op
/// when ENABLE_BACKTRACES is false.
void llvm::EnablePrettyStackTrace() {
#if ENABLE_BACKTRACES
  // Initializing a function-local static runs the registration exactly once,
  // the first time control reaches this line.
  static bool CrashPrinterInstalled = RegisterCrashPrinter();
  static_cast<void>(CrashPrinterInstalled);
#endif
}
/// Turn SIGINFO-triggered stack printing on or off for the calling thread.
///
/// Disabling stores the sentinel 0 in the thread-local generation counter.
/// Enabling installs the process-wide SIGINFO callback on first use and then
/// copies the current global generation into the thread-local counter.
/// A no-op when ENABLE_BACKTRACES is false.
void llvm::EnablePrettyStackTraceOnSigInfoForThisThread(bool ShouldEnable) {
#if ENABLE_BACKTRACES
  if (ShouldEnable) {
    // The first time we get here, register the SIGINFO handler. All the
    // handler does is bump the global generation counter.
    static bool InfoHandlerInstalled = [] {
      sys::SetInfoSignalFunction([] {
        GlobalSigInfoGenerationCounter.fetch_add(1, std::memory_order_relaxed);
      });
      return false;
    }();
    static_cast<void>(InfoHandlerInstalled);

    // Next, enable it for the current thread.
    ThreadLocalSigInfoGenerationCounter =
        GlobalSigInfoGenerationCounter.load(std::memory_order_relaxed);
  } else {
    ThreadLocalSigInfoGenerationCounter = 0;
  }
#endif
}
/// Return an opaque token for the current thread's pretty stack: its head
/// entry pointer, or nullptr when ENABLE_BACKTRACES is false.
const void *llvm::SavePrettyStackState() {
  const void *Snapshot = nullptr;
#if ENABLE_BACKTRACES
  Snapshot = PrettyStackTraceHead;
#endif
  return Snapshot;
}
/// Make \p Top the head of the current thread's pretty stack. \p Top is
/// converted back to a PrettyStackTraceEntry pointer without any checking.
/// A no-op when ENABLE_BACKTRACES is false.
void llvm::RestorePrettyStackState(const void *Top) {
#if ENABLE_BACKTRACES
  // Strip the const first, then recover the entry type.
  void *MutableTop = const_cast<void *>(Top);
  PrettyStackTraceHead = static_cast<PrettyStackTraceEntry *>(MutableTop);
#endif
}
void LLVMEnablePrettyStackTrace() {
EnablePrettyStackTrace();
}