fix: tracking of the number of unhealthy servers was incorrect
fix: locality equality was only checking zoneId
parent 90bf277206
commit 72c86e909e
@@ -260,7 +260,7 @@ struct ServerStatus {
 	bool isUnhealthy() const { return isFailed || isUndesired; }
 	const char* toString() const { return isFailed ? "Failed" : isUndesired ? "Undesired" : "Healthy"; }
 
-	bool operator == (ServerStatus const& r) const { return isFailed == r.isFailed && isUndesired == r.isUndesired && isWrongConfiguration == r.isWrongConfiguration && locality.zoneId() == r.locality.zoneId(); }
+	bool operator == (ServerStatus const& r) const { return isFailed == r.isFailed && isUndesired == r.isUndesired && isWrongConfiguration == r.isWrongConfiguration && locality == r.locality; }
 
 	//If a process has reappeared without the storage server that was on it (isFailed == true), we don't need to exclude it
 	//We also don't need to exclude processes who are in the wrong configuration (since those servers will be removed)
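Note on the hunk above: ServerStatus::operator== previously compared only the zone IDs of the two localities, so two statuses whose localities differed in any other respect still compared equal; the fix compares the whole locality. A minimal stand-alone sketch of the difference, using a simplified stand-in for the locality type (the field names here are illustrative, not the real LocalityData API):

// Illustrative only: a stripped-down stand-in for a locality, showing why
// comparing zoneId alone is weaker than comparing the whole locality.
#include <cassert>
#include <string>

struct Locality {
	std::string zoneId;
	std::string machineId; // hypothetical extra field, for illustration

	bool operator==(const Locality& r) const {
		return zoneId == r.zoneId && machineId == r.machineId;
	}
};

int main() {
	Locality a{"zone-1", "machine-A"};
	Locality b{"zone-1", "machine-B"};
	assert(a.zoneId == b.zoneId); // the old check would have treated these as equal
	assert(!(a == b));            // the new check sees that the localities differ
	return 0;
}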
@@ -305,13 +305,17 @@ ACTOR Future<Void> storageServerFailureTracker(
 	Version addedVersion )
 {
 	loop {
-		bool unhealthy = statusMap->count(server.id()) && statusMap->get(server.id()).isUnhealthy();
+		if( statusMap->count(server.id()) ) {
+			bool unhealthy = statusMap->get(server.id()).isUnhealthy();
 			if(unhealthy && !status->isUnhealthy()) {
 				(*unhealthyServers)--;
 			}
 			if(!unhealthy && status->isUnhealthy()) {
 				(*unhealthyServers)++;
 			}
+		} else if(status->isUnhealthy()) {
+			(*unhealthyServers)++;
+		}
 
 		statusMap->set( server.id(), *status );
 		if( status->isFailed )
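Note on the hunk above: the tracker now treats "server has no entry in statusMap yet" as an explicit branch, incrementing the counter only when the first observed status is unhealthy, rather than folding that case into a single boolean. A rough stand-alone sketch of the counter invariant this update rule maintains (simplified types, not the real AsyncMap/ServerStatus API):

// Illustrative only: keep a running count of unhealthy entries consistent
// with a status map, following the same update rule as the tracker above.
#include <cassert>
#include <map>
#include <string>

struct Status { bool unhealthy = false; };

void updateStatus(std::map<std::string, Status>& statusMap, int& unhealthyCount,
                  const std::string& id, const Status& next) {
	auto it = statusMap.find(id);
	if (it != statusMap.end()) {
		// Previously tracked: adjust the counter only when healthiness flips.
		if (it->second.unhealthy && !next.unhealthy) unhealthyCount--;
		if (!it->second.unhealthy && next.unhealthy) unhealthyCount++;
	} else if (next.unhealthy) {
		// First observation of this server: count it if it starts out unhealthy.
		unhealthyCount++;
	}
	statusMap[id] = next;
}

int main() {
	std::map<std::string, Status> statusMap;
	int unhealthyCount = 0;
	updateStatus(statusMap, unhealthyCount, "ss-1", Status{true});  // new and unhealthy -> 1
	updateStatus(statusMap, unhealthyCount, "ss-1", Status{true});  // unchanged         -> 1
	updateStatus(statusMap, unhealthyCount, "ss-1", Status{false}); // recovered         -> 0
	assert(unhealthyCount == 0);
	return 0;
}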
@@ -593,7 +597,7 @@ struct DDTeamCollection {
 		teamBuilder.cancel();
 	}
 
-	ACTOR Future<Void> logOnCompletion( Future<Void> signal, DDTeamCollection *self ) {
+	ACTOR static Future<Void> logOnCompletion( Future<Void> signal, DDTeamCollection *self ) {
 		Void _ = wait(signal);
 		Void _ = wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskDataDistribution));
 
@@ -606,7 +610,7 @@ struct DDTeamCollection {
 		return Void();
 	}
 
-	ACTOR Future<Void> checkBuildTeams( DDTeamCollection* self ) {
+	ACTOR static Future<Void> checkBuildTeams( DDTeamCollection* self ) {
 		state Promise<Void> restart;
 
 		Void _ = wait( self->checkTeamDelay );
@@ -640,7 +644,7 @@ struct DDTeamCollection {
 
 	// SOMEDAY: Make bestTeam better about deciding to leave a shard where it is (e.g. in PRIORITY_TEAM_HEALTHY case)
 	// use keys, src, dest, metrics, priority, system load, etc.. to decide...
-	ACTOR Future<Void> getTeam( DDTeamCollection* self, GetTeamRequest req ) {
+	ACTOR static Future<Void> getTeam( DDTeamCollection* self, GetTeamRequest req ) {
 		try {
 			Void _ = wait( self->checkBuildTeams( self ) );
 
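The three hunks above only mark logOnCompletion, checkBuildTeams, and getTeam as static; each already takes the collection through an explicit DDTeamCollection* self parameter. A plain-C++ sketch of that shape (hypothetical names, and without the flow ACTOR machinery the real code relies on):

// Hypothetical plain-C++ sketch of the "static member function + explicit self"
// shape; the real code uses the flow ACTOR compiler, not reproduced here.
#include <iostream>

struct TeamCollection {
	int teamCount = 0;

	// static: no implicit 'this'; the owning collection is passed explicitly,
	// which is how the actors above are written.
	static void checkBuildTeams(TeamCollection* self) {
		std::cout << "teams: " << self->teamCount << std::endl;
	}
};

int main() {
	TeamCollection tc;
	TeamCollection::checkBuildTeams(&tc);
	return 0;
}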
@@ -1242,6 +1246,10 @@ struct DDTeamCollection {
 		}
 		server_info.erase( removedServer );
 
+		if(server_status.count(removedServer) && server_status.get(removedServer).isUnhealthy()) {
+			unhealthyServers--;
+		}
+		server_status.clear( removedServer );
 		// remove all teams that contain removedServer
 		// SOMEDAY: can we avoid walking through all teams, since we have an index of teams in which removedServer participated
 		for(int t=0; t<teams.size(); t++) {
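Note on the hunk above: when a server is removed from the collection, its server_status entry is now cleared and, if it was unhealthy, unhealthyServers is decremented, so removed servers no longer leave the counter inflated. Continuing the simplified counter sketch from earlier (same illustrative types, not the real code):

// Illustrative only: the removal path must also keep the unhealthy counter
// consistent, which is what the added lines above do for the removed server.
#include <cassert>
#include <map>
#include <string>

struct Status { bool unhealthy = false; };

void removeStatus(std::map<std::string, Status>& statusMap, int& unhealthyCount,
                  const std::string& id) {
	auto it = statusMap.find(id);
	if (it == statusMap.end()) return;
	if (it->second.unhealthy) unhealthyCount--; // a removed server is no longer counted
	statusMap.erase(it);
}

int main() {
	std::map<std::string, Status> statusMap = { {"ss-1", Status{true}} };
	int unhealthyCount = 1;
	removeStatus(statusMap, unhealthyCount, "ss-1");
	assert(unhealthyCount == 0 && statusMap.empty());
	return 0;
}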