Merge pull request #5768 from sfc-gh-ajbeamon/improved-logging-failed-network-thread
Add some additional logging if the network thread finishes, fails with an error, gets stopped, or is blocked
This commit is contained in:
commit
9730f670e1
|
@ -538,6 +538,8 @@ void DLApi::runNetwork() {
|
|||
hook.first(hook.second);
|
||||
} catch (Error& e) {
|
||||
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
||||
} catch (std::exception& e) {
|
||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
|
||||
} catch (...) {
|
||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
||||
}
|
||||
|
@ -1813,9 +1815,14 @@ THREAD_FUNC_RETURN runNetworkThread(void* param) {
|
|||
try {
|
||||
((ClientInfo*)param)->api->runNetwork();
|
||||
} catch (Error& e) {
|
||||
TraceEvent(SevError, "RunNetworkError").error(e);
|
||||
TraceEvent(SevError, "ExternalRunNetworkError").error(e);
|
||||
} catch (std::exception& e) {
|
||||
TraceEvent(SevError, "ExternalRunNetworkError").error(unknown_error()).detail("RootException", e.what());
|
||||
} catch (...) {
|
||||
TraceEvent(SevError, "ExternalRunNetworkError").error(unknown_error());
|
||||
}
|
||||
|
||||
TraceEvent("ExternalNetworkThreadTerminating");
|
||||
THREAD_RETURN;
|
||||
}
|
||||
|
||||
|
@ -1852,6 +1859,7 @@ void MultiVersionApi::stopNetwork() {
|
|||
}
|
||||
lock.leave();
|
||||
|
||||
TraceEvent("MultiVersionStopNetwork");
|
||||
localClient->api->stopNetwork();
|
||||
|
||||
if (!bypassMultiClientApi) {
|
||||
|
|
|
@ -2164,6 +2164,7 @@ void stopNetwork() {
|
|||
if (!g_network)
|
||||
throw network_not_setup();
|
||||
|
||||
TraceEvent("ClientStopNetwork");
|
||||
g_network->stop();
|
||||
closeTraceFile();
|
||||
}
|
||||
|
|
|
@ -443,7 +443,14 @@ void ThreadSafeApi::runNetwork() {
|
|||
try {
|
||||
::runNetwork();
|
||||
} catch (Error& e) {
|
||||
TraceEvent(SevError, "RunNetworkError").error(e);
|
||||
runErr = e;
|
||||
} catch (std::exception& e) {
|
||||
runErr = unknown_error();
|
||||
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
|
||||
} catch (...) {
|
||||
runErr = unknown_error();
|
||||
TraceEvent(SevError, "RunNetworkError").error(unknown_error());
|
||||
}
|
||||
|
||||
for (auto& hook : threadCompletionHooks) {
|
||||
|
@ -451,6 +458,8 @@ void ThreadSafeApi::runNetwork() {
|
|||
hook.first(hook.second);
|
||||
} catch (Error& e) {
|
||||
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
||||
} catch (std::exception& e) {
|
||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
|
||||
} catch (...) {
|
||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
||||
}
|
||||
|
@ -459,6 +468,8 @@ void ThreadSafeApi::runNetwork() {
|
|||
if (runErr.present()) {
|
||||
throw runErr.get();
|
||||
}
|
||||
|
||||
TraceEvent("RunNetworkTerminating");
|
||||
}
|
||||
|
||||
void ThreadSafeApi::stopNetwork() {
|
||||
|
|
|
@ -57,6 +57,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
|
|||
init( SLOWTASK_PROFILING_LOG_INTERVAL, 0 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
|
||||
init( SLOWTASK_PROFILING_MAX_LOG_INTERVAL, 1.0 );
|
||||
init( SLOWTASK_PROFILING_LOG_BACKOFF, 2.0 );
|
||||
init( SLOWTASK_BLOCKED_INTERVAL, 60.0 );
|
||||
init( SATURATION_PROFILING_LOG_INTERVAL, 0.5 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
|
||||
init( SATURATION_PROFILING_MAX_LOG_INTERVAL, 5.0 );
|
||||
init( SATURATION_PROFILING_LOG_BACKOFF, 2.0 );
|
||||
|
|
|
@ -125,6 +125,7 @@ public:
|
|||
double SLOWTASK_PROFILING_LOG_INTERVAL;
|
||||
double SLOWTASK_PROFILING_MAX_LOG_INTERVAL;
|
||||
double SLOWTASK_PROFILING_LOG_BACKOFF;
|
||||
double SLOWTASK_BLOCKED_INTERVAL;
|
||||
double SATURATION_PROFILING_LOG_INTERVAL;
|
||||
double SATURATION_PROFILING_MAX_LOG_INTERVAL;
|
||||
double SATURATION_PROFILING_LOG_BACKOFF;
|
||||
|
|
|
@ -3581,8 +3581,10 @@ void* checkThread(void* arg) {
|
|||
int64_t lastRunLoopIterations = net2RunLoopIterations.load();
|
||||
int64_t lastRunLoopSleeps = net2RunLoopSleeps.load();
|
||||
|
||||
double slowTaskStart = 0;
|
||||
double lastSlowTaskSignal = 0;
|
||||
double lastSaturatedSignal = 0;
|
||||
double lastSlowTaskBlockedLog = 0;
|
||||
|
||||
const double minSlowTaskLogInterval =
|
||||
std::max(FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
|
||||
|
@ -3603,7 +3605,19 @@ void* checkThread(void* arg) {
|
|||
|
||||
if (slowTask) {
|
||||
double t = timer();
|
||||
if (lastSlowTaskSignal == 0 || t - lastSlowTaskSignal >= slowTaskLogInterval) {
|
||||
bool newSlowTask = lastSlowTaskSignal == 0;
|
||||
|
||||
if (newSlowTask) {
|
||||
slowTaskStart = t;
|
||||
} else if (t - std::max(slowTaskStart, lastSlowTaskBlockedLog) > FLOW_KNOBS->SLOWTASK_BLOCKED_INTERVAL) {
|
||||
lastSlowTaskBlockedLog = t;
|
||||
// When this gets logged, it will be with a current timestamp (using timer()). If the network thread
|
||||
// unblocks, it will log any slow task related events at an earlier timestamp. That means the order of
|
||||
// events during this sequence will not match their timestamp order.
|
||||
TraceEvent(SevWarnAlways, "RunLoopBlocked").detail("Duration", t - slowTaskStart);
|
||||
}
|
||||
|
||||
if (newSlowTask || t - lastSlowTaskSignal >= slowTaskLogInterval) {
|
||||
if (lastSlowTaskSignal > 0) {
|
||||
slowTaskLogInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL,
|
||||
FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * slowTaskLogInterval);
|
||||
|
@ -3614,6 +3628,7 @@ void* checkThread(void* arg) {
|
|||
pthread_kill(mainThread, SIGPROF);
|
||||
}
|
||||
} else {
|
||||
slowTaskStart = 0;
|
||||
lastSlowTaskSignal = 0;
|
||||
lastRunLoopIterations = currentRunLoopIterations;
|
||||
slowTaskLogInterval = minSlowTaskLogInterval;
|
||||
|
|
Loading…
Reference in New Issue