From 598b2eaeb054155972b7b9faac71de98740cf0ce Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Wed, 8 Jan 2020 13:31:58 -0800 Subject: [PATCH] fdbrpc: Add warning when peer is unavailable for long time --- fdbrpc/FlowTransport.actor.cpp | 13 +++++++++++++ flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + 3 files changed, 15 insertions(+) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 8d4b8025be..8d66fa667f 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -403,6 +403,7 @@ ACTOR Future connectionKeeper( Reference self, .detail("PeerAddr", self->destination) .detail("ConnSet", (bool)conn); + state Optional firstConnFailedTime = Optional(); loop { try { if (!conn) { // Always, except for the first loop with an incoming connection @@ -464,6 +465,7 @@ ACTOR Future connectionKeeper( Reference self, self->outgoingConnectionIdle = false; } + firstConnFailedTime.reset(); try { self->transport->countConnEstablished++; wait( connectionWriter( self, conn ) || reader || connectionMonitor(self) ); @@ -485,6 +487,17 @@ ACTOR Future connectionKeeper( Reference self, } else { self->reconnectionDelay = std::min(FLOW_KNOBS->MAX_RECONNECTION_TIME, self->reconnectionDelay * FLOW_KNOBS->RECONNECTION_TIME_GROWTH_RATE); } + + if (firstConnFailedTime.present()) { + if (now() - firstConnFailedTime.get() > FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT) { + TraceEvent(SevWarnAlways, "PeerUnavailableForLongTime", conn ? conn->getDebugID() : UID()) + .detail("PeerAddr", self->destination); + firstConnFailedTime = now() - FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT/2.0; + } + } else { + firstConnFailedTime = now(); + } + self->discardUnreliablePackets(); reader = Future(); bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled || diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index d1510817e4..bfc75361c8 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -72,6 +72,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( USE_OBJECT_SERIALIZER, 1 ); init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY, 5.0 ); init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 ); + init( PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT, 3600.0 ); init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index 333c1c1222..923ae83548 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -106,6 +106,7 @@ public: double PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION; double TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY; int TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT; + int PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT; //AsyncFileEIO int EIO_MAX_PARALLELISM;