From 9e137d3b49c8fa1a4f504b4a69a37a37532d6d0c Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 19 Dec 2019 18:26:29 -0800 Subject: [PATCH] fix: addPeerReference only marks a connection as healthy if it is the first peerReference added additional logging to long LoadBalance calls, and when the failure monitor state changes for an address --- fdbrpc/FailureMonitor.actor.cpp | 10 ++++++++-- fdbrpc/FlowTransport.actor.cpp | 6 ++++-- fdbrpc/LoadBalance.actor.h | 16 ++++++++++++++++ flow/Trace.h | 4 ++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/fdbrpc/FailureMonitor.actor.cpp b/fdbrpc/FailureMonitor.actor.cpp index 0e6fa9d135..2c5a449e44 100644 --- a/fdbrpc/FailureMonitor.actor.cpp +++ b/fdbrpc/FailureMonitor.actor.cpp @@ -73,9 +73,9 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat // for an endpoint that is waited on changes, the waiter sees its failure status change auto it = addressStatus.find(address); - //TraceEvent("NotifyFailureStatus").detail("Address", address).detail("Status", status.failed ? "Failed" : "OK").detail("Present", it == addressStatus.end()); if (it == addressStatus.end()) { if (status != FailureStatus()) { + TraceEvent("NotifyAddressHealthy").suppressFor(1.0).detail("Address", address); addressStatus[address]=status; endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) ); } @@ -85,8 +85,14 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat it->second = status; else addressStatus.erase(it); - if(triggerEndpoint) + if(triggerEndpoint) { + if(status.failed) { + TraceEvent("NotifyAddressFailed").suppressFor(1.0).detail("Address", address); + } else { + TraceEvent("NotifyAddressHealthyPresent").suppressFor(1.0).detail("Address", address); + } endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) ); + } } } diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 260a27718f..ab93d8ae6e 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -1094,11 +1094,13 @@ Endpoint FlowTransport::loadedEndpoint( const UID& token ) { void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) { if (!isStream || !endpoint.getPrimaryAddress().isValid()) return; - else if (FlowTransport::transport().isClient()) - IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false)); Reference peer = self->getOrOpenPeer(endpoint.getPrimaryAddress()); + if(peer->peerReferences == -1) { + if (FlowTransport::transport().isClient()) { + IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false)); + } peer->peerReferences = 1; } else { peer->peerReferences++; diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 0f8a216ddb..d6192209cc 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -185,6 +185,7 @@ Future< REPLY_TYPE(Request) > loadBalance( state Future secondDelay = Never(); state Promise requestFinished; + state double startTime = now(); setReplyPriority(request, taskID); if (!alternatives) @@ -278,6 +279,21 @@ Future< REPLY_TYPE(Request) > loadBalance( state double backoff = 0; state bool triedAllOptions = false; loop { + if(now() - startTime > g_network->isSimulated() ? 30.0 : 600.0) { + TraceEvent ev(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LoadBalanceTooLong"); + ev.suppressFor(1.0); + ev.detail("Duration", now() - startTime); + ev.detail("NumAttempts", numAttempts); + ev.detail("Backoff", backoff); + ev.detail("TriedAllOptions", triedAllOptions); + if(ev.isEnabled()) { + for(int alternativeNum=0; alternativeNumsize(); alternativeNum++) { + RequestStream const* thisStream = &alternatives->get( alternativeNum, channel ); + TraceEvent(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LoadBalanceTooLongEndpoint").detail("Addr", thisStream->getEndpoint().getPrimaryAddress()).detail("Token", thisStream->getEndpoint().token).detail("Failed", IFailureMonitor::failureMonitor().getState( thisStream->getEndpoint() ).failed); + } + } + } + // Find an alternative, if any, that is not failed, starting with nextAlt state RequestStream const* stream = NULL; for(int alternativeNum=0; alternativeNumsize(); alternativeNum++) { diff --git a/flow/Trace.h b/flow/Trace.h index da2d39fc55..a49d676df4 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -460,6 +460,10 @@ public: TraceEvent& GetLastError(); + bool isEnabled() const { + return enabled; + } + ~TraceEvent(); // Actually logs the event // Return the number of invocations of TraceEvent() at the specified logging level.