2017-05-26 04:48:44 +08:00
|
|
|
/*
|
|
|
|
* FailureMonitor.actor.cpp
|
|
|
|
*
|
|
|
|
* This source file is part of the FoundationDB open source project
|
|
|
|
*
|
|
|
|
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
|
2018-02-22 02:25:11 +08:00
|
|
|
*
|
2017-05-26 04:48:44 +08:00
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
2018-02-22 02:25:11 +08:00
|
|
|
*
|
2017-05-26 04:48:44 +08:00
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
2018-02-22 02:25:11 +08:00
|
|
|
*
|
2017-05-26 04:48:44 +08:00
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
2018-10-20 01:30:13 +08:00
|
|
|
#include "fdbrpc/FailureMonitor.h"
|
2020-01-10 06:26:05 +08:00
|
|
|
#include "flow/actorcompiler.h" // This must be the last #include.
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Completes once `monitor` reports `status` for `endpoint`.
//
// Each iteration first obtains the state-change future and only afterwards samples the current state;
// if the state does not match yet, the actor waits on that already-obtained future and then re-checks.
ACTOR Future<Void> waitForStateEqual(IFailureMonitor* monitor, Endpoint endpoint, FailureStatus status) {
	loop {
		// Keep this ordering: register for the next change before reading the state.
		Future<Void> nextChange = monitor->onStateChanged(endpoint);
		if (monitor->getState(endpoint) == status) {
			return Void();
		}
		wait(nextChange);
	}
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Completes when `endpoint` has stayed failed for long enough, or as soon as the monitor reports it as
// permanently failed.
//
// The required failure time grows with the time elapsed since this actor started:
//   X == sustainedFailureDuration + slope * (now() - start + X)
// which, solved for X, gives the hold time computed below. The division by (1 - slope) means callers must
// pass slope < 1.
ACTOR Future<Void> waitForContinuousFailure(IFailureMonitor* monitor, Endpoint endpoint,
                                            double sustainedFailureDuration, double slope) {
	state double monitoringStart = now();

	loop {
		wait(monitor->onFailed(endpoint));
		if (monitor->permanentlyFailed(endpoint)) {
			return Void();
		}

		// X == sustainedFailureDuration + slope * (now()-startT+X)
		double elapsed = now() - monitoringStart;
		double holdTime = (sustainedFailureDuration + slope * elapsed) / (1 - slope);

		// SOMEDAY: if we know that this process is a server or client we can tune this optimization better
		// We will not get a failure monitoring update in less than the smaller request interval, so there is
		// no point in waiting for changes when the hold time is shorter than that.
		if (holdTime < std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL, FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) {
			holdTime = 0;
		}

		choose {
			// The endpoint recovered before the hold time elapsed: go around the loop and wait for the next
			// failure. SOMEDAY: Use onStateChanged() for efficiency
			when(wait(monitor->onStateEqual(endpoint, FailureStatus(false)))) {}
			// The hold time elapsed without a recovery.
			when(wait(delay(holdTime))) { return Void(); }
		}
	}
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Returns a future that is ready once getState(endpoint) equals `status`.
// If the state already matches, an already-ready future is returned and no actor is started.
Future<Void> IFailureMonitor::onStateEqual(Endpoint const& endpoint, FailureStatus status) {
	const bool alreadyInState = (status == getState(endpoint));
	if (!alreadyInState) {
		return waitForStateEqual(this, endpoint, status);
	}
	return Void();
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Starts waitForContinuousFailure() for `endpoint` on this monitor and returns its future.
// `slope` must be strictly below 1.0; waitForContinuousFailure() divides by (1 - slope).
Future<Void> IFailureMonitor::onFailedFor(Endpoint const& endpoint, double sustainedFailureDuration, double slope) {
	ASSERT(slope < 1.0);
	Future<Void> sustainedFailure = waitForContinuousFailure(this, endpoint, sustainedFailureDuration, slope);
	return sustainedFailure;
}
|
|
|
|
|
2020-05-12 02:24:19 +08:00
|
|
|
// Registers this process's own address(es), as reported by FlowTransport, with a non-failed status so
// that the local process is marked as available in the failure monitor.
SimpleFailureMonitor::SimpleFailureMonitor() : endpointKnownFailed() {
	const FailureStatus available(false);
	const auto& self = FlowTransport::transport().getLocalAddresses();

	addressStatus[self.address] = available;
	// The secondary address is optional; record it only when one is present.
	if (self.secondaryAddress.present()) {
		addressStatus[self.secondaryAddress.get()] = available;
	}
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Records `status` for `address` and notifies waiters when the stored status changes.
//
// A status equal to a default-constructed FailureStatus is never stored: setting it on an unknown address
// is a no-op, and setting it on a known address erases the entry. Waiters are triggered over the endpoint
// range for `address` whenever a new entry is created or an existing entry's status differs from `status`.
void SimpleFailureMonitor::setStatus(NetworkAddress const& address, FailureStatus const& status) {
	// Disabled debug output:
	// if (status.failed)
	//     printf("On machine '%s': Machine '%s' is failed\n", g_network->getLocalAddress().toString().c_str(),
	//            address.toString().c_str());
	// printf("%s.setState(%s, %s) %p\n", g_network->getLocalAddress().toString(),
	//        address.toString(), status.failed ? "FAILED" : "OK", this);

	// onStateChanged() will be waiting on endpointKnownFailed only where it is false, so if the address status
	// for an endpoint that is waited on changes, the waiter sees its failure status change
	auto notifyAddressRange = [this, &address]() {
		endpointKnownFailed.triggerRange(Endpoint({ address }, UID()), Endpoint({ address }, UID(-1, -1)));
	};

	// Only statuses that differ from the default are kept in the map.
	const bool keepEntry = status != FailureStatus();

	auto entry = addressStatus.find(address);
	if (entry == addressStatus.end()) {
		// Unknown address: nothing to do unless there is a non-default status to record.
		if (!keepEntry) {
			return;
		}
		TraceEvent("NotifyAddressHealthy").suppressFor(1.0).detail("Address", address);
		addressStatus.emplace(address, status);
		notifyAddressRange();
		return;
	}

	// Known address: compare against the stored value before it is overwritten or erased.
	const bool statusChanged = status != entry->second;
	if (keepEntry) {
		entry->second = status;
	} else {
		addressStatus.erase(entry);
	}

	if (!statusChanged) {
		return;
	}
	if (status.failed) {
		TraceEvent("NotifyAddressFailed").suppressFor(1.0).detail("Address", address);
	} else {
		TraceEvent("NotifyAddressHealthyPresent").suppressFor(1.0).detail("Address", address);
	}
	notifyAddressRange();
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Handles a report that `endpoint` does not exist.
//
// Endpoints whose token's first half equals -1 (traced as "well known") are only logged. Any other endpoint
// is logged, added to failedEndpoints, and its waiters on endpointKnownFailed are triggered.
void SimpleFailureMonitor::endpointNotFound(Endpoint const& endpoint) {
	// SOMEDAY: Expiration (this "leaks" memory)
	const bool wellKnown = endpoint.token.first() == -1;

	if (!wellKnown) {
		TraceEvent("EndpointNotFound")
		    .suppressFor(1.0)
		    .detail("Address", endpoint.getPrimaryAddress())
		    .detail("Token", endpoint.token);
		failedEndpoints.insert(endpoint);
		endpointKnownFailed.trigger(endpoint);
		return;
	}

	// Well-known endpoint: trace it, but do not record it as failed.
	TraceEvent("WellKnownEndpointNotFound")
	    .suppressFor(1.0)
	    .detail("Address", endpoint.getPrimaryAddress())
	    .detail("TokenFirst", endpoint.token.first())
	    .detail("TokenSecond", endpoint.token.second());
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Triggers endpointKnownFailed over the endpoint range for `address`, from token UID() to token UID(-1, -1).
// No stored status is modified here.
void SimpleFailureMonitor::notifyDisconnect(NetworkAddress const& address) {
	// Disabled trace: TraceEvent("NotifyDisconnect").detail("Address", address);
	const Endpoint rangeBegin({ address }, UID());
	const Endpoint rangeEnd({ address }, UID(-1, -1));
	endpointKnownFailed.triggerRange(rangeBegin, rangeEnd);
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Returns a future that becomes ready when `endpoint` is disconnected or failed.
//
// It is ready immediately when the primary address has no recorded status, when its recorded status is
// failed, or when the endpoint itself is in failedEndpoints.
Future<Void> SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoint) {
	const auto addressEntry = addressStatus.find(endpoint.getPrimaryAddress());
	const bool addressHealthy = addressEntry != addressStatus.end() && !addressEntry->second.isFailed();

	if (addressHealthy && !failedEndpoints.count(endpoint)) {
		// Return when the endpoint is triggered, which means that either the endpoint has become known failed,
		// or the address has changed state (and since it was previously not failed, it must now be failed), or
		// notifyDisconnect() has been called.
		return endpointKnownFailed.onChange(endpoint);
	}

	// The endpoint or its address is already failed, so there is nothing to wait for.
	TraceEvent("AlreadyDisconnected").detail("Addr", endpoint.getPrimaryAddress()).detail("Tok", endpoint.token);
	return Void();
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Returns a future that fires when the failure state of `endpoint` may have changed.
//
// For an endpoint not in failedEndpoints this waits on endpointKnownFailed, which picks up both
// endpointNotFound errors and changes to addressStatus (which trigger a range). For an endpoint already in
// failedEndpoints it returns Never(): its failure status can never change, and waiting could be spuriously
// triggered by setStatus.
//
// The future also fires spuriously when notifyDisconnect is called (which doesn't actually change the
// state), but callers check the state so it's OK.
Future<Void> SimpleFailureMonitor::onStateChanged(Endpoint const& endpoint) {
	const bool knownFailed = failedEndpoints.find(endpoint) != failedEndpoints.end();
	if (!knownFailed) {
		return endpointKnownFailed.onChange(endpoint);
	}
	return Never();
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Returns the failure status of `endpoint`.
//
// An endpoint in failedEndpoints is always reported as FailureStatus(true). Otherwise the status recorded
// for its primary address is returned, or a default-constructed FailureStatus when none is recorded.
FailureStatus SimpleFailureMonitor::getState(Endpoint const& endpoint) {
	if (failedEndpoints.count(endpoint)) {
		return FailureStatus(true);
	}

	auto entry = addressStatus.find(endpoint.getPrimaryAddress());
	// Disabled debug output:
	// printf("%s.getState(%s) = %s %p\n", g_network->getLocalAddress().toString(), endpoint.address.toString(),
	//        a.failed ? "FAILED" : "OK", this);
	return entry == addressStatus.end() ? FailureStatus() : entry->second;
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Returns the status recorded for `address`, or a default-constructed FailureStatus when the address has
// no entry in addressStatus.
FailureStatus SimpleFailureMonitor::getState(NetworkAddress const& address) {
	auto entry = addressStatus.find(address);
	const bool known = entry != addressStatus.end();
	return known ? entry->second : FailureStatus();
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Returns true when `endpoint` is in failedEndpoints while its primary address is not recorded as failed
// (an address with no recorded status counts as not failed here).
bool SimpleFailureMonitor::onlyEndpointFailed(Endpoint const& endpoint) {
	if (!failedEndpoints.count(endpoint)) {
		return false;
	}

	auto entry = addressStatus.find(endpoint.getPrimaryAddress());
	return entry == addressStatus.end() || !entry->second.failed;
}
|
|
|
|
|
2020-01-10 06:26:05 +08:00
|
|
|
// Returns true when `endpoint` is present in failedEndpoints.
bool SimpleFailureMonitor::permanentlyFailed(Endpoint const& endpoint) {
	const bool recordedAsFailed = failedEndpoints.find(endpoint) != failedEndpoints.end();
	return recordedAsFailed;
}
|
|
|
|
|
|
|
|
void SimpleFailureMonitor::reset() {
|
2020-01-10 06:26:05 +08:00
|
|
|
addressStatus = std::unordered_map<NetworkAddress, FailureStatus>();
|
2020-05-21 03:15:30 +08:00
|
|
|
failedEndpoints = std::unordered_set<Endpoint>();
|
2017-05-26 04:48:44 +08:00
|
|
|
endpointKnownFailed.resetNoWaiting();
|
|
|
|
}
|