Remove the caching of dl_iterate_phdr results that was used by the slow task profiler; instead, disable and re-enable profiling around the call. Add a mechanism to handle deferred profile requests.

This commit is contained in:
A.J. Beamon 2019-07-03 12:48:36 -07:00
parent 8afab93e29
commit 6a899ddff3
8 changed files with 64 additions and 74 deletions

View File

@ -26,7 +26,6 @@
#include "flow/serialize.h"
#include "flow/IRandom.h"
#include "flow/genericactors.actor.h"
#include "flow/SignalSafeUnwind.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/BackupAgent.actor.h"
@ -2454,7 +2453,6 @@ extern uint8_t *g_extra_memory;
int main(int argc, char* argv[]) {
platformInit();
initSignalSafeUnwind();
int status = FDB_EXIT_SUCCESS;

View File

@ -32,7 +32,6 @@
#include "fdbclient/FDBOptions.g.h"
#include "flow/DeterministicRandom.h"
#include "flow/SignalSafeUnwind.h"
#include "fdbrpc/TLSConnection.h"
#include "fdbrpc/Platform.h"
@ -3457,7 +3456,6 @@ ACTOR Future<Void> timeExit(double duration) {
int main(int argc, char **argv) {
platformInit();
initSignalSafeUnwind();
Error::init();
std::set_new_handler( &platform::outOfMemory );
uint64_t memLimit = 8LL << 30;

View File

@ -57,7 +57,6 @@
#include "fdbrpc/Platform.h"
#include "fdbrpc/AsyncFileCached.actor.h"
#include "fdbserver/CoroFlow.h"
#include "flow/SignalSafeUnwind.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
@ -870,7 +869,6 @@ std::pair<NetworkAddressList, NetworkAddressList> buildNetworkAddresses(const Cl
int main(int argc, char* argv[]) {
try {
platformInit();
initSignalSafeUnwind();
#ifdef ALLOC_INSTRUMENTATION
g_extra_memory = new uint8_t[1000000];

View File

@ -36,6 +36,7 @@ struct SlowTaskWorkload : TestWorkload {
}
// Workload start hook: calls setupSlowTaskProfiler() and then returns the future produced by go().
// The database handle `cx` is not used in this body.
virtual Future<Void> start(Database const& cx) {
setupSlowTaskProfiler();
return go();
}
@ -49,6 +50,9 @@ struct SlowTaskWorkload : TestWorkload {
ACTOR static Future<Void> go() {
wait( delay(1) );
int64_t phc = dl_iterate_phdr_calls;
int64_t startProfilesDeferred = getNumProfilesDeferred();
int64_t startProfilesOverflowed = getNumProfilesOverflowed();
int64_t startProfilesCaptured = getNumProfilesCaptured();
int64_t exc = 0;
fprintf(stderr, "Slow task starting\n");
for(int i=0; i<10; i++) {
@ -58,7 +62,12 @@ struct SlowTaskWorkload : TestWorkload {
do_slow_exception_thing(&exc);
}
}
fprintf(stderr, "Slow task complete: %" PRId64 " exceptions; %" PRId64 " calls to dl_iterate_phdr\n", exc, dl_iterate_phdr_calls - phc);
fprintf(stderr, "Slow task complete: %" PRId64 " exceptions; %" PRId64 " calls to dl_iterate_phdr, %" PRId64 " profiles deferred, %" PRId64 " profiles overflowed, %" PRId64 " profiles captured\n",
exc, dl_iterate_phdr_calls - phc,
getNumProfilesDeferred() - startProfilesDeferred,
getNumProfilesOverflowed() - startProfilesOverflowed,
getNumProfilesCaptured() - startProfilesCaptured);
return Void();
}

View File

@ -2802,39 +2802,59 @@ extern volatile void** net2backtraces;
extern volatile size_t net2backtraces_offset;
extern volatile size_t net2backtraces_max;
extern volatile bool net2backtraces_overflow;
extern volatile int net2backtraces_count;
extern volatile int64_t net2backtraces_count;
extern std::atomic<int64_t> net2liveness;
extern volatile thread_local int profilingEnabled;
extern void initProfiling();
volatile thread_local bool profileThread = false;
#endif
volatile thread_local bool profileThread = false;
volatile thread_local int profilingEnabled = 1;
void setProfilingEnabled(int enabled) {
profilingEnabled = enabled;
volatile thread_local int64_t numProfilesDeferred = 0;
volatile thread_local int64_t numProfilesOverflowed = 0;
volatile thread_local int64_t numProfilesCaptured = 0;
volatile thread_local bool profileRequested = false;
// Returns the calling thread's value of the thread_local counter numProfilesDeferred, which
// profileHandler increments when a profile signal arrives while profiling is disabled.
int64_t getNumProfilesDeferred() {
return numProfilesDeferred;
}
// Returns the calling thread's value of the thread_local counter numProfilesOverflowed, which
// profileHandler increments when the backtrace buffer is missing or has fewer than 50 slots left.
int64_t getNumProfilesOverflowed() {
return numProfilesOverflowed;
}
// Returns the calling thread's value of the thread_local counter numProfilesCaptured, which
// profileHandler increments just before it records a backtrace sample.
int64_t getNumProfilesCaptured() {
return numProfilesCaptured;
}
void profileHandler(int sig) {
#ifdef __linux__
if (!profileThread || !profilingEnabled) {
if(!profileThread) {
return;
}
net2backtraces_count++;
if(!profilingEnabled) {
profileRequested = true;
++numProfilesDeferred;
return;
}
++net2backtraces_count;
if (!net2backtraces || net2backtraces_max - net2backtraces_offset < 50) {
++numProfilesOverflowed;
net2backtraces_overflow = true;
return;
}
++numProfilesCaptured;
// We are casting away the volatile-ness of the backtrace array, but we believe that should be reasonably safe in the signal handler
ProfilingSample* ps = const_cast<ProfilingSample*>((volatile ProfilingSample*)(net2backtraces + net2backtraces_offset));
ps->timestamp = timer();
// SOMEDAY: should we limit the maximum number of frames from
// backtrace beyond just available space?
// SOMEDAY: should we limit the maximum number of frames from backtrace beyond just available space?
size_t size = backtrace(ps->frames, net2backtraces_max - net2backtraces_offset - 2);
ps->length = size;
@ -2845,6 +2865,17 @@ void profileHandler(int sig) {
#endif
}
// Enables (non-zero) or disables (zero) profiling for the calling thread by updating the
// thread_local profilingEnabled flag that profileHandler consults.
//
// On Linux, when profiling is being re-enabled on a profiled thread and profileHandler recorded a
// request while profiling was off (profileRequested), the request is replayed by sending SIGPROF
// to the calling thread.
//
// The pthread/SIGPROF path is compiled only under __linux__, matching the guards used by
// profileHandler, checkThread and setupSlowTaskProfiler; on other platforms the flag is just stored.
void setProfilingEnabled(int enabled) {
#ifdef __linux__
	if(profileThread && enabled && !profilingEnabled && profileRequested) {
		// Turn profiling back on before raising the signal so the handler does not defer it again.
		profilingEnabled = true;
		profileRequested = false;
		pthread_kill(pthread_self(), SIGPROF);
		return;
	}
#endif
	profilingEnabled = enabled;
}
void* checkThread(void *arg) {
#ifdef __linux__
pthread_t mainThread = *(pthread_t*)arg;
@ -2882,7 +2913,7 @@ void* checkThread(void *arg) {
void setupSlowTaskProfiler() {
#ifdef __linux__
if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
if (!profileThread && FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
TraceEvent("StartingSlowTaskProfilingThread").detail("Interval", FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL);
initProfiling();
profileThread = true;

View File

@ -617,6 +617,11 @@ void registerCrashHandler();
void setupSlowTaskProfiler();
EXTERNC void setProfilingEnabled(int enabled);
// These return the calling thread's (thread-local) profile counters
int64_t getNumProfilesDeferred();
int64_t getNumProfilesOverflowed();
int64_t getNumProfilesCaptured();
// Use _exit() or criticalError(), not exit()
#define CALLS_TO_EXIT_ARE_FORBIDDEN_BY_POLICY() [====]
#define exit CALLS_TO_EXIT_ARE_FORBIDDEN_BY_POLICY(0)

View File

@ -27,17 +27,9 @@ int64_t dl_iterate_phdr_calls = 0;
#include <link.h>
#include <mutex>
static bool phdr_cache_initialized = false;
static std::vector< std::vector<uint8_t> > phdr_cache;
static int (*chain_dl_iterate_phdr)(
int (*callback) (struct dl_phdr_info *info, size_t size, void *data),
void *data) = nullptr;
// Callback handed to dl_iterate_phdr: appends a byte-for-byte copy of the first `size` bytes of
// *info to phdr_cache. `data` is unused. Always returns 0.
static int phdr_cache_add( struct dl_phdr_info *info, size_t size, void *data ) {
	uint8_t* const rawBegin = (uint8_t*)info;
	uint8_t* const rawEnd = rawBegin + size;
	phdr_cache.emplace_back( rawBegin, rawEnd );
	return 0;
}
int (*callback) (struct dl_phdr_info *info, size_t size, void *data),
void *data) = nullptr;
static void initChain() {
static std::once_flag flag;
@ -50,15 +42,6 @@ static void initChain() {
}
}
// Rebuilds phdr_cache: runs initChain(), empties the cache, then walks chain_dl_iterate_phdr with
// phdr_cache_add as the callback. A non-zero result is reported through criticalError().
// Finally marks the cache as initialized.
void initSignalSafeUnwind() {
	initChain();
	phdr_cache.clear();

	const int iterateResult = chain_dl_iterate_phdr(&phdr_cache_add, 0);
	if (iterateResult != 0) {
		criticalError(FDB_EXIT_ERROR, "DLIterateError", "dl_iterate_phdr error");
	}

	phdr_cache_initialized = true;
}
// This overrides the function in libc!
extern "C" int dl_iterate_phdr(
int (*callback) (struct dl_phdr_info *info, size_t size, void *data),
@ -66,29 +49,11 @@ extern "C" int dl_iterate_phdr(
{
interlockedIncrement64(&dl_iterate_phdr_calls);
if (phdr_cache_initialized)
{
// This path should be async signal safe
for(int i=0; i<phdr_cache.size(); i++)
{
int r = callback( (struct dl_phdr_info*)&phdr_cache[i][0], phdr_cache[i].size(), data );
if (r!=0)
return r;
}
return 0;
} else {
// This path is NOT async signal safe, and serves until and unless initSignalSafeUnwind() is called
initChain();
initChain();
setProfilingEnabled(0);
int result = chain_dl_iterate_phdr(callback, data);
setProfilingEnabled(1);
return result;
}
setProfilingEnabled(0);
int result = chain_dl_iterate_phdr(callback, data);
setProfilingEnabled(1);
return result;
}
#else // __linux__
// Non-Linux build: there is nothing to initialize, so this is an empty stub.
void initSignalSafeUnwind() {}
#endif

View File

@ -24,20 +24,6 @@
#include "flow/Platform.h"
// backtrace() and exception unwinding in glibc both call dl_iterate_phdr(),
// which takes the loader lock and so is not async-signal-safe. Profiling or slow task
// profiling can deadlock when they interrupt the unwinding of an exception.
// This library overrides the implementation of dl_iterate_phdr() so that it
// can be async-signal-safe in this context, at the cost of other restrictions.
// Call this function after all dynamic libraries are loaded
// (no further calls to dlopen() or dlclose() are permitted).
// After calling it, dl_iterate_phdr() will be async-signal-safe.
// At this time, it is a no-op on all platforms except Linux.
void initSignalSafeUnwind();
// This can be used by tests to measure the number of calls to dl_iterate_phdr intercepted
extern int64_t dl_iterate_phdr_calls;