Merge pull request #5768 from sfc-gh-ajbeamon/improved-logging-failed-network-thread

Add some additional logging if the network thread finishes, fails with an error, gets stopped, or is blocked
This commit is contained in:
Markus Pilman 2021-10-19 13:10:16 -06:00 committed by GitHub
commit 9730f670e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 39 additions and 2 deletions

View File

@ -538,6 +538,8 @@ void DLApi::runNetwork() {
hook.first(hook.second);
} catch (Error& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
} catch (std::exception& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
}
@ -1813,9 +1815,14 @@ THREAD_FUNC_RETURN runNetworkThread(void* param) {
try {
((ClientInfo*)param)->api->runNetwork();
} catch (Error& e) {
TraceEvent(SevError, "RunNetworkError").error(e);
TraceEvent(SevError, "ExternalRunNetworkError").error(e);
} catch (std::exception& e) {
TraceEvent(SevError, "ExternalRunNetworkError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
TraceEvent(SevError, "ExternalRunNetworkError").error(unknown_error());
}
TraceEvent("ExternalNetworkThreadTerminating");
THREAD_RETURN;
}
@ -1852,6 +1859,7 @@ void MultiVersionApi::stopNetwork() {
}
lock.leave();
TraceEvent("MultiVersionStopNetwork");
localClient->api->stopNetwork();
if (!bypassMultiClientApi) {

View File

@ -2164,6 +2164,7 @@ void stopNetwork() {
if (!g_network)
throw network_not_setup();
TraceEvent("ClientStopNetwork");
g_network->stop();
closeTraceFile();
}

View File

@ -443,7 +443,14 @@ void ThreadSafeApi::runNetwork() {
try {
::runNetwork();
} catch (Error& e) {
TraceEvent(SevError, "RunNetworkError").error(e);
runErr = e;
} catch (std::exception& e) {
runErr = unknown_error();
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
runErr = unknown_error();
TraceEvent(SevError, "RunNetworkError").error(unknown_error());
}
for (auto& hook : threadCompletionHooks) {
@ -451,6 +458,8 @@ void ThreadSafeApi::runNetwork() {
hook.first(hook.second);
} catch (Error& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
} catch (std::exception& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
}
@ -459,6 +468,8 @@ void ThreadSafeApi::runNetwork() {
if (runErr.present()) {
throw runErr.get();
}
TraceEvent("RunNetworkTerminating");
}
void ThreadSafeApi::stopNetwork() {

View File

@ -57,6 +57,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
init( SLOWTASK_PROFILING_LOG_INTERVAL, 0 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
init( SLOWTASK_PROFILING_MAX_LOG_INTERVAL, 1.0 );
init( SLOWTASK_PROFILING_LOG_BACKOFF, 2.0 );
init( SLOWTASK_BLOCKED_INTERVAL, 60.0 );
init( SATURATION_PROFILING_LOG_INTERVAL, 0.5 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
init( SATURATION_PROFILING_MAX_LOG_INTERVAL, 5.0 );
init( SATURATION_PROFILING_LOG_BACKOFF, 2.0 );

View File

@ -125,6 +125,7 @@ public:
double SLOWTASK_PROFILING_LOG_INTERVAL;
double SLOWTASK_PROFILING_MAX_LOG_INTERVAL;
double SLOWTASK_PROFILING_LOG_BACKOFF;
double SLOWTASK_BLOCKED_INTERVAL;
double SATURATION_PROFILING_LOG_INTERVAL;
double SATURATION_PROFILING_MAX_LOG_INTERVAL;
double SATURATION_PROFILING_LOG_BACKOFF;

View File

@ -3581,8 +3581,10 @@ void* checkThread(void* arg) {
int64_t lastRunLoopIterations = net2RunLoopIterations.load();
int64_t lastRunLoopSleeps = net2RunLoopSleeps.load();
double slowTaskStart = 0;
double lastSlowTaskSignal = 0;
double lastSaturatedSignal = 0;
double lastSlowTaskBlockedLog = 0;
const double minSlowTaskLogInterval =
std::max(FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
@ -3603,7 +3605,19 @@ void* checkThread(void* arg) {
if (slowTask) {
double t = timer();
if (lastSlowTaskSignal == 0 || t - lastSlowTaskSignal >= slowTaskLogInterval) {
bool newSlowTask = lastSlowTaskSignal == 0;
if (newSlowTask) {
slowTaskStart = t;
} else if (t - std::max(slowTaskStart, lastSlowTaskBlockedLog) > FLOW_KNOBS->SLOWTASK_BLOCKED_INTERVAL) {
lastSlowTaskBlockedLog = t;
// When this gets logged, it will be with a current timestamp (using timer()). If the network thread
// unblocks, it will log any slow task related events at an earlier timestamp. That means the order of
// events during this sequence will not match their timestamp order.
TraceEvent(SevWarnAlways, "RunLoopBlocked").detail("Duration", t - slowTaskStart);
}
if (newSlowTask || t - lastSlowTaskSignal >= slowTaskLogInterval) {
if (lastSlowTaskSignal > 0) {
slowTaskLogInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL,
FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * slowTaskLogInterval);
@ -3614,6 +3628,7 @@ void* checkThread(void* arg) {
pthread_kill(mainThread, SIGPROF);
}
} else {
slowTaskStart = 0;
lastSlowTaskSignal = 0;
lastRunLoopIterations = currentRunLoopIterations;
slowTaskLogInterval = minSlowTaskLogInterval;