fdbrpc: Add warning when peer is unavailable for a long time

This commit is contained in:
Vishesh Yadav 2020-01-08 13:31:58 -08:00
parent 6b8daeae6e
commit 598b2eaeb0
3 changed files with 15 additions and 0 deletions

View File

@ -403,6 +403,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
.detail("PeerAddr", self->destination)
.detail("ConnSet", (bool)conn);
state Optional<double> firstConnFailedTime = Optional<double>();
loop {
try {
if (!conn) { // Always, except for the first loop with an incoming connection
@ -464,6 +465,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
self->outgoingConnectionIdle = false;
}
firstConnFailedTime.reset();
try {
self->transport->countConnEstablished++;
wait( connectionWriter( self, conn ) || reader || connectionMonitor(self) );
@ -485,6 +487,17 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
} else {
self->reconnectionDelay = std::min(FLOW_KNOBS->MAX_RECONNECTION_TIME, self->reconnectionDelay * FLOW_KNOBS->RECONNECTION_TIME_GROWTH_RATE);
}
if (firstConnFailedTime.present()) {
if (now() - firstConnFailedTime.get() > FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT) {
TraceEvent(SevWarnAlways, "PeerUnavailableForLongTime", conn ? conn->getDebugID() : UID())
.detail("PeerAddr", self->destination);
firstConnFailedTime = now() - FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT/2.0;
}
} else {
firstConnFailedTime = now();
}
self->discardUnreliablePackets();
reader = Future<Void>();
bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled ||

View File

@ -72,6 +72,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
init( USE_OBJECT_SERIALIZER, 1 );
init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY, 5.0 );
init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 );
init( PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT, 3600.0 );
init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 );

View File

@ -106,6 +106,7 @@ public:
double PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION;
double TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY;
int TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT;
int PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT;
//AsyncFileEIO
int EIO_MAX_PARALLELISM;