Merge pull request #5768 from sfc-gh-ajbeamon/improved-logging-failed-network-thread
Add some additional logging if the network thread finishes, fails with an error, gets stopped, or is blocked
This commit is contained in:
commit
9730f670e1
|
@ -538,6 +538,8 @@ void DLApi::runNetwork() {
|
||||||
hook.first(hook.second);
|
hook.first(hook.second);
|
||||||
} catch (Error& e) {
|
} catch (Error& e) {
|
||||||
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
||||||
|
} catch (std::exception& e) {
|
||||||
|
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
||||||
}
|
}
|
||||||
|
@ -1813,9 +1815,14 @@ THREAD_FUNC_RETURN runNetworkThread(void* param) {
|
||||||
try {
|
try {
|
||||||
((ClientInfo*)param)->api->runNetwork();
|
((ClientInfo*)param)->api->runNetwork();
|
||||||
} catch (Error& e) {
|
} catch (Error& e) {
|
||||||
TraceEvent(SevError, "RunNetworkError").error(e);
|
TraceEvent(SevError, "ExternalRunNetworkError").error(e);
|
||||||
|
} catch (std::exception& e) {
|
||||||
|
TraceEvent(SevError, "ExternalRunNetworkError").error(unknown_error()).detail("RootException", e.what());
|
||||||
|
} catch (...) {
|
||||||
|
TraceEvent(SevError, "ExternalRunNetworkError").error(unknown_error());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TraceEvent("ExternalNetworkThreadTerminating");
|
||||||
THREAD_RETURN;
|
THREAD_RETURN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1852,6 +1859,7 @@ void MultiVersionApi::stopNetwork() {
|
||||||
}
|
}
|
||||||
lock.leave();
|
lock.leave();
|
||||||
|
|
||||||
|
TraceEvent("MultiVersionStopNetwork");
|
||||||
localClient->api->stopNetwork();
|
localClient->api->stopNetwork();
|
||||||
|
|
||||||
if (!bypassMultiClientApi) {
|
if (!bypassMultiClientApi) {
|
||||||
|
|
|
@ -2164,6 +2164,7 @@ void stopNetwork() {
|
||||||
if (!g_network)
|
if (!g_network)
|
||||||
throw network_not_setup();
|
throw network_not_setup();
|
||||||
|
|
||||||
|
TraceEvent("ClientStopNetwork");
|
||||||
g_network->stop();
|
g_network->stop();
|
||||||
closeTraceFile();
|
closeTraceFile();
|
||||||
}
|
}
|
||||||
|
|
|
@ -443,7 +443,14 @@ void ThreadSafeApi::runNetwork() {
|
||||||
try {
|
try {
|
||||||
::runNetwork();
|
::runNetwork();
|
||||||
} catch (Error& e) {
|
} catch (Error& e) {
|
||||||
|
TraceEvent(SevError, "RunNetworkError").error(e);
|
||||||
runErr = e;
|
runErr = e;
|
||||||
|
} catch (std::exception& e) {
|
||||||
|
runErr = unknown_error();
|
||||||
|
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
|
||||||
|
} catch (...) {
|
||||||
|
runErr = unknown_error();
|
||||||
|
TraceEvent(SevError, "RunNetworkError").error(unknown_error());
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto& hook : threadCompletionHooks) {
|
for (auto& hook : threadCompletionHooks) {
|
||||||
|
@ -451,6 +458,8 @@ void ThreadSafeApi::runNetwork() {
|
||||||
hook.first(hook.second);
|
hook.first(hook.second);
|
||||||
} catch (Error& e) {
|
} catch (Error& e) {
|
||||||
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
||||||
|
} catch (std::exception& e) {
|
||||||
|
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
||||||
}
|
}
|
||||||
|
@ -459,6 +468,8 @@ void ThreadSafeApi::runNetwork() {
|
||||||
if (runErr.present()) {
|
if (runErr.present()) {
|
||||||
throw runErr.get();
|
throw runErr.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TraceEvent("RunNetworkTerminating");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadSafeApi::stopNetwork() {
|
void ThreadSafeApi::stopNetwork() {
|
||||||
|
|
|
@ -57,6 +57,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
|
||||||
init( SLOWTASK_PROFILING_LOG_INTERVAL, 0 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
|
init( SLOWTASK_PROFILING_LOG_INTERVAL, 0 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
|
||||||
init( SLOWTASK_PROFILING_MAX_LOG_INTERVAL, 1.0 );
|
init( SLOWTASK_PROFILING_MAX_LOG_INTERVAL, 1.0 );
|
||||||
init( SLOWTASK_PROFILING_LOG_BACKOFF, 2.0 );
|
init( SLOWTASK_PROFILING_LOG_BACKOFF, 2.0 );
|
||||||
|
init( SLOWTASK_BLOCKED_INTERVAL, 60.0 );
|
||||||
init( SATURATION_PROFILING_LOG_INTERVAL, 0.5 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
|
init( SATURATION_PROFILING_LOG_INTERVAL, 0.5 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
|
||||||
init( SATURATION_PROFILING_MAX_LOG_INTERVAL, 5.0 );
|
init( SATURATION_PROFILING_MAX_LOG_INTERVAL, 5.0 );
|
||||||
init( SATURATION_PROFILING_LOG_BACKOFF, 2.0 );
|
init( SATURATION_PROFILING_LOG_BACKOFF, 2.0 );
|
||||||
|
|
|
@ -125,6 +125,7 @@ public:
|
||||||
double SLOWTASK_PROFILING_LOG_INTERVAL;
|
double SLOWTASK_PROFILING_LOG_INTERVAL;
|
||||||
double SLOWTASK_PROFILING_MAX_LOG_INTERVAL;
|
double SLOWTASK_PROFILING_MAX_LOG_INTERVAL;
|
||||||
double SLOWTASK_PROFILING_LOG_BACKOFF;
|
double SLOWTASK_PROFILING_LOG_BACKOFF;
|
||||||
|
double SLOWTASK_BLOCKED_INTERVAL;
|
||||||
double SATURATION_PROFILING_LOG_INTERVAL;
|
double SATURATION_PROFILING_LOG_INTERVAL;
|
||||||
double SATURATION_PROFILING_MAX_LOG_INTERVAL;
|
double SATURATION_PROFILING_MAX_LOG_INTERVAL;
|
||||||
double SATURATION_PROFILING_LOG_BACKOFF;
|
double SATURATION_PROFILING_LOG_BACKOFF;
|
||||||
|
|
|
@ -3581,8 +3581,10 @@ void* checkThread(void* arg) {
|
||||||
int64_t lastRunLoopIterations = net2RunLoopIterations.load();
|
int64_t lastRunLoopIterations = net2RunLoopIterations.load();
|
||||||
int64_t lastRunLoopSleeps = net2RunLoopSleeps.load();
|
int64_t lastRunLoopSleeps = net2RunLoopSleeps.load();
|
||||||
|
|
||||||
|
double slowTaskStart = 0;
|
||||||
double lastSlowTaskSignal = 0;
|
double lastSlowTaskSignal = 0;
|
||||||
double lastSaturatedSignal = 0;
|
double lastSaturatedSignal = 0;
|
||||||
|
double lastSlowTaskBlockedLog = 0;
|
||||||
|
|
||||||
const double minSlowTaskLogInterval =
|
const double minSlowTaskLogInterval =
|
||||||
std::max(FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
|
std::max(FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
|
||||||
|
@ -3603,7 +3605,19 @@ void* checkThread(void* arg) {
|
||||||
|
|
||||||
if (slowTask) {
|
if (slowTask) {
|
||||||
double t = timer();
|
double t = timer();
|
||||||
if (lastSlowTaskSignal == 0 || t - lastSlowTaskSignal >= slowTaskLogInterval) {
|
bool newSlowTask = lastSlowTaskSignal == 0;
|
||||||
|
|
||||||
|
if (newSlowTask) {
|
||||||
|
slowTaskStart = t;
|
||||||
|
} else if (t - std::max(slowTaskStart, lastSlowTaskBlockedLog) > FLOW_KNOBS->SLOWTASK_BLOCKED_INTERVAL) {
|
||||||
|
lastSlowTaskBlockedLog = t;
|
||||||
|
// When this gets logged, it will be with a current timestamp (using timer()). If the network thread
|
||||||
|
// unblocks, it will log any slow task related events at an earlier timestamp. That means the order of
|
||||||
|
// events during this sequence will not match their timestamp order.
|
||||||
|
TraceEvent(SevWarnAlways, "RunLoopBlocked").detail("Duration", t - slowTaskStart);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (newSlowTask || t - lastSlowTaskSignal >= slowTaskLogInterval) {
|
||||||
if (lastSlowTaskSignal > 0) {
|
if (lastSlowTaskSignal > 0) {
|
||||||
slowTaskLogInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL,
|
slowTaskLogInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL,
|
||||||
FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * slowTaskLogInterval);
|
FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * slowTaskLogInterval);
|
||||||
|
@ -3614,6 +3628,7 @@ void* checkThread(void* arg) {
|
||||||
pthread_kill(mainThread, SIGPROF);
|
pthread_kill(mainThread, SIGPROF);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
slowTaskStart = 0;
|
||||||
lastSlowTaskSignal = 0;
|
lastSlowTaskSignal = 0;
|
||||||
lastRunLoopIterations = currentRunLoopIterations;
|
lastRunLoopIterations = currentRunLoopIterations;
|
||||||
slowTaskLogInterval = minSlowTaskLogInterval;
|
slowTaskLogInterval = minSlowTaskLogInterval;
|
||||||
|
|
Loading…
Reference in New Issue