Remove the dl_iterate_phdr results caching used by the slow task profiler, instead favoring disabling and re-enabling profiling around the call. Add a mechanism to handle deferred profile requests.
This commit is contained in:
parent
8afab93e29
commit
6a899ddff3
|
@ -26,7 +26,6 @@
|
|||
#include "flow/serialize.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/genericactors.actor.h"
|
||||
#include "flow/SignalSafeUnwind.h"
|
||||
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/BackupAgent.actor.h"
|
||||
|
@ -2454,7 +2453,6 @@ extern uint8_t *g_extra_memory;
|
|||
|
||||
int main(int argc, char* argv[]) {
|
||||
platformInit();
|
||||
initSignalSafeUnwind();
|
||||
|
||||
int status = FDB_EXIT_SUCCESS;
|
||||
|
||||
|
|
|
@ -32,7 +32,6 @@
|
|||
#include "fdbclient/FDBOptions.g.h"
|
||||
|
||||
#include "flow/DeterministicRandom.h"
|
||||
#include "flow/SignalSafeUnwind.h"
|
||||
#include "fdbrpc/TLSConnection.h"
|
||||
#include "fdbrpc/Platform.h"
|
||||
|
||||
|
@ -3457,7 +3456,6 @@ ACTOR Future<Void> timeExit(double duration) {
|
|||
|
||||
int main(int argc, char **argv) {
|
||||
platformInit();
|
||||
initSignalSafeUnwind();
|
||||
Error::init();
|
||||
std::set_new_handler( &platform::outOfMemory );
|
||||
uint64_t memLimit = 8LL << 30;
|
||||
|
|
|
@ -57,7 +57,6 @@
|
|||
#include "fdbrpc/Platform.h"
|
||||
#include "fdbrpc/AsyncFileCached.actor.h"
|
||||
#include "fdbserver/CoroFlow.h"
|
||||
#include "flow/SignalSafeUnwind.h"
|
||||
#if defined(CMAKE_BUILD) || !defined(WIN32)
|
||||
#include "versions.h"
|
||||
#endif
|
||||
|
@ -870,7 +869,6 @@ std::pair<NetworkAddressList, NetworkAddressList> buildNetworkAddresses(const Cl
|
|||
int main(int argc, char* argv[]) {
|
||||
try {
|
||||
platformInit();
|
||||
initSignalSafeUnwind();
|
||||
|
||||
#ifdef ALLOC_INSTRUMENTATION
|
||||
g_extra_memory = new uint8_t[1000000];
|
||||
|
|
|
@ -36,6 +36,7 @@ struct SlowTaskWorkload : TestWorkload {
|
|||
}
|
||||
|
||||
virtual Future<Void> start(Database const& cx) {
|
||||
setupSlowTaskProfiler();
|
||||
return go();
|
||||
}
|
||||
|
||||
|
@ -49,6 +50,9 @@ struct SlowTaskWorkload : TestWorkload {
|
|||
ACTOR static Future<Void> go() {
|
||||
wait( delay(1) );
|
||||
int64_t phc = dl_iterate_phdr_calls;
|
||||
int64_t startProfilesDeferred = getNumProfilesDeferred();
|
||||
int64_t startProfilesOverflowed = getNumProfilesOverflowed();
|
||||
int64_t startProfilesCaptured = getNumProfilesCaptured();
|
||||
int64_t exc = 0;
|
||||
fprintf(stderr, "Slow task starting\n");
|
||||
for(int i=0; i<10; i++) {
|
||||
|
@ -58,7 +62,12 @@ struct SlowTaskWorkload : TestWorkload {
|
|||
do_slow_exception_thing(&exc);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "Slow task complete: %" PRId64 " exceptions; %" PRId64 " calls to dl_iterate_phdr\n", exc, dl_iterate_phdr_calls - phc);
|
||||
fprintf(stderr, "Slow task complete: %" PRId64 " exceptions; %" PRId64 " calls to dl_iterate_phdr, %" PRId64 " profiles deferred, %" PRId64 " profiles overflowed, %" PRId64 " profiles captured\n",
|
||||
exc, dl_iterate_phdr_calls - phc,
|
||||
getNumProfilesDeferred() - startProfilesDeferred,
|
||||
getNumProfilesOverflowed() - startProfilesOverflowed,
|
||||
getNumProfilesCaptured() - startProfilesCaptured);
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
|
|
@ -2802,39 +2802,59 @@ extern volatile void** net2backtraces;
|
|||
extern volatile size_t net2backtraces_offset;
|
||||
extern volatile size_t net2backtraces_max;
|
||||
extern volatile bool net2backtraces_overflow;
|
||||
extern volatile int net2backtraces_count;
|
||||
extern volatile int64_t net2backtraces_count;
|
||||
extern std::atomic<int64_t> net2liveness;
|
||||
extern volatile thread_local int profilingEnabled;
|
||||
extern void initProfiling();
|
||||
|
||||
volatile thread_local bool profileThread = false;
|
||||
#endif
|
||||
|
||||
volatile thread_local bool profileThread = false;
|
||||
volatile thread_local int profilingEnabled = 1;
|
||||
|
||||
void setProfilingEnabled(int enabled) {
|
||||
profilingEnabled = enabled;
|
||||
volatile thread_local int64_t numProfilesDeferred = 0;
|
||||
volatile thread_local int64_t numProfilesOverflowed = 0;
|
||||
volatile thread_local int64_t numProfilesCaptured = 0;
|
||||
volatile thread_local bool profileRequested = false;
|
||||
|
||||
int64_t getNumProfilesDeferred() {
|
||||
return numProfilesDeferred;
|
||||
}
|
||||
|
||||
int64_t getNumProfilesOverflowed() {
|
||||
return numProfilesOverflowed;
|
||||
}
|
||||
|
||||
int64_t getNumProfilesCaptured() {
|
||||
return numProfilesCaptured;
|
||||
}
|
||||
|
||||
void profileHandler(int sig) {
|
||||
#ifdef __linux__
|
||||
if (!profileThread || !profilingEnabled) {
|
||||
if(!profileThread) {
|
||||
return;
|
||||
}
|
||||
|
||||
net2backtraces_count++;
|
||||
if(!profilingEnabled) {
|
||||
profileRequested = true;
|
||||
++numProfilesDeferred;
|
||||
return;
|
||||
}
|
||||
|
||||
++net2backtraces_count;
|
||||
|
||||
if (!net2backtraces || net2backtraces_max - net2backtraces_offset < 50) {
|
||||
++numProfilesOverflowed;
|
||||
net2backtraces_overflow = true;
|
||||
return;
|
||||
}
|
||||
|
||||
++numProfilesCaptured;
|
||||
|
||||
// We are casting away the volatile-ness of the backtrace array, but we believe that should be reasonably safe in the signal handler
|
||||
ProfilingSample* ps = const_cast<ProfilingSample*>((volatile ProfilingSample*)(net2backtraces + net2backtraces_offset));
|
||||
|
||||
ps->timestamp = timer();
|
||||
|
||||
// SOMEDAY: should we limit the maximum number of frames from
|
||||
// backtrace beyond just available space?
|
||||
// SOMEDAY: should we limit the maximum number of frames from backtrace beyond just available space?
|
||||
size_t size = backtrace(ps->frames, net2backtraces_max - net2backtraces_offset - 2);
|
||||
|
||||
ps->length = size;
|
||||
|
@ -2845,6 +2865,17 @@ void profileHandler(int sig) {
|
|||
#endif
|
||||
}
|
||||
|
||||
void setProfilingEnabled(int enabled) {
|
||||
if(profileThread && enabled && !profilingEnabled && profileRequested) {
|
||||
profilingEnabled = true;
|
||||
profileRequested = false;
|
||||
pthread_kill(pthread_self(), SIGPROF);
|
||||
}
|
||||
else {
|
||||
profilingEnabled = enabled;
|
||||
}
|
||||
}
|
||||
|
||||
void* checkThread(void *arg) {
|
||||
#ifdef __linux__
|
||||
pthread_t mainThread = *(pthread_t*)arg;
|
||||
|
@ -2882,7 +2913,7 @@ void* checkThread(void *arg) {
|
|||
|
||||
void setupSlowTaskProfiler() {
|
||||
#ifdef __linux__
|
||||
if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
|
||||
if (!profileThread && FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
|
||||
TraceEvent("StartingSlowTaskProfilingThread").detail("Interval", FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL);
|
||||
initProfiling();
|
||||
profileThread = true;
|
||||
|
|
|
@ -617,6 +617,11 @@ void registerCrashHandler();
|
|||
void setupSlowTaskProfiler();
|
||||
EXTERNC void setProfilingEnabled(int enabled);
|
||||
|
||||
// These return thread local counts
|
||||
int64_t getNumProfilesDeferred();
|
||||
int64_t getNumProfilesOverflowed();
|
||||
int64_t getNumProfilesCaptured();
|
||||
|
||||
// Use _exit() or criticalError(), not exit()
|
||||
#define CALLS_TO_EXIT_ARE_FORBIDDEN_BY_POLICY() [====]
|
||||
#define exit CALLS_TO_EXIT_ARE_FORBIDDEN_BY_POLICY(0)
|
||||
|
|
|
@ -27,17 +27,9 @@ int64_t dl_iterate_phdr_calls = 0;
|
|||
#include <link.h>
|
||||
#include <mutex>
|
||||
|
||||
static bool phdr_cache_initialized = false;
|
||||
static std::vector< std::vector<uint8_t> > phdr_cache;
|
||||
|
||||
static int (*chain_dl_iterate_phdr)(
|
||||
int (*callback) (struct dl_phdr_info *info, size_t size, void *data),
|
||||
void *data) = nullptr;
|
||||
|
||||
static int phdr_cache_add( struct dl_phdr_info *info, size_t size, void *data ) {
|
||||
phdr_cache.push_back( std::vector<uint8_t>((uint8_t*)info, (uint8_t*)info + size) );
|
||||
return 0;
|
||||
}
|
||||
int (*callback) (struct dl_phdr_info *info, size_t size, void *data),
|
||||
void *data) = nullptr;
|
||||
|
||||
static void initChain() {
|
||||
static std::once_flag flag;
|
||||
|
@ -50,15 +42,6 @@ static void initChain() {
|
|||
}
|
||||
}
|
||||
|
||||
void initSignalSafeUnwind() {
|
||||
initChain();
|
||||
|
||||
phdr_cache.clear();
|
||||
if (chain_dl_iterate_phdr(&phdr_cache_add, 0))
|
||||
criticalError(FDB_EXIT_ERROR, "DLIterateError", "dl_iterate_phdr error");
|
||||
phdr_cache_initialized = true;
|
||||
}
|
||||
|
||||
// This overrides the function in libc!
|
||||
extern "C" int dl_iterate_phdr(
|
||||
int (*callback) (struct dl_phdr_info *info, size_t size, void *data),
|
||||
|
@ -66,29 +49,11 @@ extern "C" int dl_iterate_phdr(
|
|||
{
|
||||
interlockedIncrement64(&dl_iterate_phdr_calls);
|
||||
|
||||
if (phdr_cache_initialized)
|
||||
{
|
||||
// This path should be async signal safe
|
||||
for(int i=0; i<phdr_cache.size(); i++)
|
||||
{
|
||||
int r = callback( (struct dl_phdr_info*)&phdr_cache[i][0], phdr_cache[i].size(), data );
|
||||
if (r!=0)
|
||||
return r;
|
||||
}
|
||||
return 0;
|
||||
} else {
|
||||
// This path is NOT async signal safe, and serves until and unless initSignalSafeUnwind() is called
|
||||
initChain();
|
||||
initChain();
|
||||
|
||||
setProfilingEnabled(0);
|
||||
int result = chain_dl_iterate_phdr(callback, data);
|
||||
setProfilingEnabled(1);
|
||||
return result;
|
||||
}
|
||||
setProfilingEnabled(0);
|
||||
int result = chain_dl_iterate_phdr(callback, data);
|
||||
setProfilingEnabled(1);
|
||||
return result;
|
||||
}
|
||||
|
||||
#else // __linux__
|
||||
|
||||
void initSignalSafeUnwind() {}
|
||||
|
||||
#endif
|
|
@ -24,20 +24,6 @@
|
|||
|
||||
#include "flow/Platform.h"
|
||||
|
||||
|
||||
// backtrace() and exception unwinding in glibc both call dl_iterate_phdr(),
|
||||
// which takes the loader lock and so is not async signal safe. Profiling or slow task
|
||||
// profiling can deadlock when they interrupt the unwinding of an exception.
|
||||
|
||||
// This library overrides the implementation of dl_iterate_phdr() so that it
|
||||
// can be async signal safe in this context, at the cost of other restrictions
|
||||
|
||||
// Call this function after all dynamic libraries are loaded
|
||||
// (no further calls to dlopen() or dlclose() are permitted).
|
||||
// After calling it, dl_iterate_phdr() will be async-signal-safe.
|
||||
// At this time, it is a no-op on all platforms except Linux
|
||||
void initSignalSafeUnwind();
|
||||
|
||||
// This can be used by tests to measure the number of calls to dl_iterate_phdr intercepted
|
||||
extern int64_t dl_iterate_phdr_calls;
|
||||
|
||||
|
|
Loading…
Reference in New Issue