foundationdb/fdbserver/TCInfo.actor.cpp

126 lines
4.9 KiB
C++
Raw Normal View History

2022-02-05 02:20:11 +08:00
#include "fdbclient/ServerKnobs.h"
#include "fdbserver/DDTeamCollection.h"
2022-02-05 02:59:01 +08:00
#include "fdbserver/TCInfo.h"
2022-02-05 02:20:11 +08:00
#include "flow/actorcompiler.h" // This must be the last #include.
// Actor implementations backing TCServerInfo's metrics-related member functions
// (the TCServerInfo methods later in this file forward to these statics).
class TCServerInfoImpl {
public:
// Fetches storage metrics for `server`, stores the reply in server->serverMetrics, and then
// updates the server's "version too far behind" flag and the owning collection's
// lagging-server accounting based on that reply.
//
// Returns early, without touching the metrics or the flag, if server->onRemoved fires first.
// Otherwise it keeps retrying until a reply is actually present.
ACTOR static Future<Void> updateServerMetrics(TCServerInfo* server) {
// Interface currently being queried; replaced whenever onInterfaceChanged fires.
state StorageServerInterface ssi = server->lastKnownInterface;
state Future<ErrorOr<GetStorageMetricsReply>> metricsRequest =
ssi.getStorageMetrics.tryGetReply(GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch);
// Drives retries: Never() while a request is outstanding, otherwise a delay / failure-monitor
// wait / immediate Void() that leads to the request being re-issued.
state Future<Void> resetRequest = Never();
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged(server->onInterfaceChanged);
state Future<Void> serverRemoved(server->onRemoved);
loop {
choose {
when(ErrorOr<GetStorageMetricsReply> rep = wait(metricsRequest)) {
if (rep.present()) {
// Success: publish the reply, notify anyone waiting on `updated`, and leave the retry loop.
server->serverMetrics = rep;
if (server->updated.canBeSet()) {
server->updated.send(Void());
}
break;
}
// The reply carried an error: stop waiting on this request and schedule a retry after
// METRIC_DELAY instead of re-sending immediately.
metricsRequest = Never();
resetRequest = delay(SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch);
}
when(std::pair<StorageServerInterface, ProcessClass> _ssi = wait(interfaceChanged)) {
// Switch to the new interface, re-arm the change notification, and make the reset branch
// fire right away so the request is re-issued against the new interface.
ssi = _ssi.first;
interfaceChanged = server->onInterfaceChanged;
resetRequest = Void();
}
when(wait(serverRemoved)) { return Void(); }
when(wait(resetRequest)) { // To prevent a tight spin loop
if (IFailureMonitor::failureMonitor().getState(ssi.getStorageMetrics.getEndpoint()).isFailed()) {
// Endpoint is currently reported as failed: wait for the failure monitor to report
// FailureStatus(false) (presumably "not failed" - confirm) before trying again.
resetRequest = IFailureMonitor::failureMonitor().onStateEqual(
ssi.getStorageMetrics.getEndpoint(), FailureStatus(false));
} else {
// Endpoint is not reported as failed: send a fresh request to the current interface.
resetRequest = Never();
metricsRequest = ssi.getStorageMetrics.tryGetReply(GetStorageMetricsRequest(),
TaskPriority::DataDistributionLaunch);
}
}
}
}
// Classify the server from the fresh metrics. A server is flagged at most once (the flag is
// checked before each add), and unflagged only when versionLag drops below
// DD_SS_ALLOWED_VERSIONLAG; if none of the three conditions holds the flag is left unchanged.
// NOTE(review): every branch calls zoneId().get(), which assumes the locality has a zone id - confirm.
if (server->serverMetrics.get().lastUpdate < now() - SERVER_KNOBS->DD_SS_STUCK_TIME_LIMIT) {
// The reply's lastUpdate is older than DD_SS_STUCK_TIME_LIMIT: treat the server as stuck.
if (server->ssVersionTooFarBehind.get() == false) {
TraceEvent("StorageServerStuck", server->collection->distributorId)
.detail("ServerId", server->id.toString())
.detail("LastUpdate", server->serverMetrics.get().lastUpdate);
server->ssVersionTooFarBehind.set(true);
server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
} else if (server->serverMetrics.get().versionLag > SERVER_KNOBS->DD_SS_FAILURE_VERSIONLAG) {
// Version lag exceeds the failure threshold: flag the server as too far behind.
if (server->ssVersionTooFarBehind.get() == false) {
TraceEvent(SevWarn, "SSVersionDiffLarge", server->collection->distributorId)
.detail("ServerId", server->id.toString())
.detail("VersionLag", server->serverMetrics.get().versionLag);
server->ssVersionTooFarBehind.set(true);
server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
} else if (server->serverMetrics.get().versionLag < SERVER_KNOBS->DD_SS_ALLOWED_VERSIONLAG) {
// Version lag is back under the allowed threshold: clear a previously set flag.
if (server->ssVersionTooFarBehind.get() == true) {
TraceEvent("SSVersionDiffNormal", server->collection->distributorId)
.detail("ServerId", server->id.toString())
.detail("VersionLag", server->serverMetrics.get().versionLag);
server->ssVersionTooFarBehind.set(false);
server->collection->removeLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
}
return Void();
}
// Overload taking a Reference: waits on the raw-pointer overload for the same server.
// NOTE(review): presumably the Reference parameter keeps the server alive while the actor runs - confirm.
ACTOR static Future<Void> updateServerMetrics(Reference<TCServerInfo> server) {
wait(updateServerMetrics(server.getPtr()));
return Void();
}
// Polls the server's metrics forever: refresh, then sleep, then repeat.
ACTOR static Future<Void> serverMetricsPolling(TCServerInfo* server) {
// Timestamp taken before each refresh starts; the next wake-up is computed from it.
state double lastUpdate = now();
loop {
wait(updateServerMetrics(server));
// Wake-up time is measured from `lastUpdate` (captured before the refresh above), plus the
// polling delay and a random fraction of STORAGE_METRICS_RANDOM_DELAY.
wait(delayUntil(lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY +
SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(),
TaskPriority::DataDistributionLaunch));
lastUpdate = now();
}
}
};
2022-02-05 02:59:01 +08:00
// Actor implementations backing TCTeamInfo's member functions.
class TCTeamInfoImpl {
public:
	// Starts a metrics refresh for every server in the team and waits on all of them
	// together via waitForAll.
	ACTOR static Future<Void> updateStorageMetrics(TCTeamInfo* self) {
		std::vector<Future<Void>> pending;
		pending.reserve(self->servers.size());
		for (const auto& member : self->servers) {
			pending.push_back(TCServerInfo::updateServerMetrics(member));
		}
		wait(waitForAll(pending));
		return Void();
	}
};
2022-02-05 02:20:11 +08:00
// Refreshes this server's storage metrics; forwards to the TCServerInfoImpl actor with `this`.
Future<Void> TCServerInfo::updateServerMetrics() {
return TCServerInfoImpl::updateServerMetrics(this);
}
// Refreshes the storage metrics of `server`; forwards to the TCServerInfoImpl overload that
// takes a Reference.
Future<Void> TCServerInfo::updateServerMetrics(Reference<TCServerInfo> server) {
return TCServerInfoImpl::updateServerMetrics(server);
}
// Starts the metrics polling loop for this server; forwards to the TCServerInfoImpl actor,
// whose loop has no exit of its own.
Future<Void> TCServerInfo::serverMetricsPolling() {
return TCServerInfoImpl::serverMetricsPolling(this);
}
// On destruction, removes this server's zone from the collection's lagging-server accounting,
// but only when there is a collection, the server is currently flagged as too far behind,
// and its interface is not a TSS.
// NOTE(review): the code in this file that adds a lagging server does not check isTss(), while
// this removal does - confirm that a TSS can never be flagged, otherwise its entry is never removed.
TCServerInfo::~TCServerInfo() {
	// Nothing to undo unless all three conditions hold (checked in the same order as before).
	if (!collection || !ssVersionTooFarBehind.get() || lastKnownInterface.isTss()) {
		return;
	}
	collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get());
}
2022-02-05 02:59:01 +08:00
// Refreshes storage metrics for the servers of this team; forwards to the TCTeamInfoImpl actor
// with `this`.
Future<Void> TCTeamInfo::updateStorageMetrics() {
return TCTeamInfoImpl::updateStorageMetrics(this);
}