fix: addPeerReference only marks a connection as healthy if it is the first peerReference

added additional logging to long LoadBalance calls, and when the failure monitor state changes for an address
This commit is contained in:
Evan Tschannen 2019-12-19 18:26:29 -08:00
parent 3c30215662
commit 9e137d3b49
4 changed files with 32 additions and 4 deletions

View File

@ -73,9 +73,9 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
// for an endpoint that is waited on changes, the waiter sees its failure status change
auto it = addressStatus.find(address);
//TraceEvent("NotifyFailureStatus").detail("Address", address).detail("Status", status.failed ? "Failed" : "OK").detail("Present", it == addressStatus.end());
if (it == addressStatus.end()) {
if (status != FailureStatus()) {
TraceEvent("NotifyAddressHealthy").suppressFor(1.0).detail("Address", address);
addressStatus[address]=status;
endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
}
@ -85,8 +85,14 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
it->second = status;
else
addressStatus.erase(it);
if(triggerEndpoint)
if(triggerEndpoint) {
if(status.failed) {
TraceEvent("NotifyAddressFailed").suppressFor(1.0).detail("Address", address);
} else {
TraceEvent("NotifyAddressHealthyPresent").suppressFor(1.0).detail("Address", address);
}
endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
}
}
}

View File

@ -1094,11 +1094,13 @@ Endpoint FlowTransport::loadedEndpoint( const UID& token ) {
void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
if (!isStream || !endpoint.getPrimaryAddress().isValid())
return;
else if (FlowTransport::transport().isClient())
IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress());
if(peer->peerReferences == -1) {
if (FlowTransport::transport().isClient()) {
IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
}
peer->peerReferences = 1;
} else {
peer->peerReferences++;

View File

@ -185,6 +185,7 @@ Future< REPLY_TYPE(Request) > loadBalance(
state Future<Void> secondDelay = Never();
state Promise<Void> requestFinished;
state double startTime = now();
setReplyPriority(request, taskID);
if (!alternatives)
@ -278,6 +279,21 @@ Future< REPLY_TYPE(Request) > loadBalance(
state double backoff = 0;
state bool triedAllOptions = false;
loop {
if(now() - startTime > g_network->isSimulated() ? 30.0 : 600.0) {
TraceEvent ev(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LoadBalanceTooLong");
ev.suppressFor(1.0);
ev.detail("Duration", now() - startTime);
ev.detail("NumAttempts", numAttempts);
ev.detail("Backoff", backoff);
ev.detail("TriedAllOptions", triedAllOptions);
if(ev.isEnabled()) {
for(int alternativeNum=0; alternativeNum<alternatives->size(); alternativeNum++) {
RequestStream<Request> const* thisStream = &alternatives->get( alternativeNum, channel );
TraceEvent(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LoadBalanceTooLongEndpoint").detail("Addr", thisStream->getEndpoint().getPrimaryAddress()).detail("Token", thisStream->getEndpoint().token).detail("Failed", IFailureMonitor::failureMonitor().getState( thisStream->getEndpoint() ).failed);
}
}
}
// Find an alternative, if any, that is not failed, starting with nextAlt
state RequestStream<Request> const* stream = NULL;
for(int alternativeNum=0; alternativeNum<alternatives->size(); alternativeNum++) {

View File

@ -460,6 +460,10 @@ public:
TraceEvent& GetLastError();
bool isEnabled() const {
return enabled;
}
~TraceEvent(); // Actually logs the event
// Return the number of invocations of TraceEvent() at the specified logging level.