diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index 61482e4f52..7b16177d44 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -715,7 +715,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 	}
 
 	ACTOR static Future<Void> interruptableBuildTeams( DDTeamCollection* self ) {
-		TraceEvent("DDInterruptableBuildTeamsStart", self->distributorId);
 		if(!self->addSubsetComplete.isSet()) {
 			wait( addSubsetOfEmergencyTeams(self) );
 			self->addSubsetComplete.send(Void());
@@ -732,7 +731,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 	}
 
 	ACTOR static Future<Void> checkBuildTeams( DDTeamCollection* self ) {
-		TraceEvent("DDCheckBuildTeamsStart", self->distributorId);
 		wait( self->checkTeamDelay );
 		while( !self->teamBuilder.isReady() )
 			wait( self->teamBuilder );
@@ -758,7 +756,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 		// shardsAffectedByTeamFailure or we could be dropping a shard on the floor (since team
 		// tracking is "edge triggered")
 		// SOMEDAY: Account for capacity, load (when shardMetrics load is high)
-		// Q: How do we enforce the above statement?
 
 		// self->teams.size() can be 0 under the ConfigureTest.txt test when we change configurations
 		// The situation happens rarely. We may want to eliminate this situation someday
@@ -2204,7 +2201,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 			.detail("CurrentTeamCount", teams.size())
 			.detail("ServerCount", server_info.size())
 			.detail("NonFailedServerCount", desiredServerSet.size());
-		traceAllInfo(true);
 	}
 
 	bool shouldHandleServer(const StorageServerInterface &newServer) {
@@ -2216,11 +2212,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 
 	void addServer( StorageServerInterface newServer, ProcessClass processClass, Promise<Void> errorOut, Version addedVersion ) {
 		if (!shouldHandleServer(newServer)) {
-			TraceEvent("AddedStorageServer", distributorId)
-			    .detail("ServerID", newServer.id())
-			    .detail("ShouldHandleServer", 0)
-			    .detail("ServerDCId", newServer.locality.dcId())
-			    .detail("IncludedDCSize", includedDCs.size());
 			return;
 		}
 		allServers.push_back( newServer.id() );
@@ -2445,7 +2436,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 			TraceEvent(SevInfo, "NoTeamsRemovedWhenServerRemoved")
 			    .detail("Primary", primary)
 			    .detail("Debug", "ThisShouldRarelyHappen_CheckInfoBelow");
-			traceAllInfo(true);
 		}
 
 		// Step: Remove machine info related to removedServer
@@ -2562,44 +2552,39 @@ bool existOtherHealthyTeams(DDTeamCollection* self, UID serverID) {
 
 ACTOR Future<Void> removeWrongStoreType(DDTeamCollection* self) {
 	// Wait for storage servers to initialize its storeType
-	wait( delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_DELAY) );
+	wait( delay(SERVER_KNOBS->DD_REMOVE_STORE_ENGINE_DELAY) );
+	state bool foundSSToRemove = false;
-	// TODO: How to reduce the amount of work when all SS have correct store type in most type? Maybe refer to badTeams remover approach
 	loop {
+		foundSSToRemove = false;
 		if (self->doRemoveWrongStoreType.get() == false) {
 			// Once the wrong storeType SS picked to be removed is removed, doRemoveWrongStoreType will be set to true;
 			// In case the SS fails in between, we should time out and check for the next SS.
-			wait(self->doRemoveWrongStoreType.onChange() || delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_TIMEOUT));
+			wait(self->doRemoveWrongStoreType.onChange() || delay(SERVER_KNOBS->DD_REMOVE_STORE_ENGINE_TIMEOUT));
 		}
 
-		TraceEvent("WrongStoreTypeRemoverStartLoop", self->distributorId)
-		    .detail("Primary", self->primary)
-		    .detail("ServerInfoSize", self->server_info.size())
-		    .detail("SysRestoreType", self->configuration.storageServerStoreType);
 		vector<Future<Void>> initializingServers;
 		for (auto& server : self->server_info) {
 			NetworkAddress a = server.second->lastKnownInterface.address();
 			AddressExclusion addr(a.ip, a.port);
 			TraceEvent("WrongStoreTypeRemover", self->distributorId)
-			    .detail("DDID", self->distributorId)
 			    .detail("Server", server.first)
 			    .detail("Addr", addr.toString())
 			    .detail("StoreType", server.second->storeType)
-			    .detail("IsCorrectStoreType",
-			            server.second->isCorrectStoreType(self->configuration.storageServerStoreType));
+			    .detail("ConfiguredStoreType", self->configuration.storageServerStoreType);
 			//if (!server.second->isCorrectStoreType(self->configuration.storageServerStoreType) && existOtherHealthyTeams(self, server.first)) {
 			if (!server.second->isCorrectStoreType(self->configuration.storageServerStoreType)) {
 				server.second->wrongStoreTypeToRemove.set(true);
+				foundSSToRemove = true;
 				break;
 			}
 		}
 		self->doRemoveWrongStoreType.set(false);
-		// if (g_network->isSimulated()) {
-		// 	// Speed up removing wrong storeType server in simulation to avoid false positive test failure in consistency check
-		// 	wait( delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_DELAY / 10) );
-		// } else {
-		// 	wait( delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_DELAY) );
-		// }
+		if (!foundSSToRemove) {
+			break;
+		}
 	}
+
+	return Void();
 }
 
 ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
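Reviewer note on the control flow above: `removeWrongStoreType` now flags at most one wrong-storeType storage server per pass (via `wrongStoreTypeToRemove`), waits until that removal completes or `DD_REMOVE_STORE_ENGINE_TIMEOUT` fires in case the flagged server dies in between, and returns once a full pass finds no offender. The sketch below models that loop in plain standard C++; it is not Flow actor code, and the `Server` struct, the store-type strings, and the mutex/condition-variable stand-ins for `doRemoveWrongStoreType` are illustrative assumptions, not FDB APIs.

```cpp
#include <algorithm>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>

// Illustrative stand-in for a storage server entry; not an FDB type.
struct Server {
    std::string id;
    std::string storeType;
    bool flaggedForRemoval = false;
};

int main() {
    const std::string configuredStoreType = "ssd-2";
    std::vector<Server> servers = { { "ss1", "ssd-2" }, { "ss2", "memory" }, { "ss3", "memory" } };

    std::mutex m;
    std::condition_variable removedCv; // stand-in for doRemoveWrongStoreType.onChange()
    bool removalDone = true;           // stand-in for doRemoveWrongStoreType.get()

    while (true) {
        {
            std::unique_lock<std::mutex> lk(m);
            // Analogue of wait(onChange() || delay(DD_REMOVE_STORE_ENGINE_TIMEOUT)):
            // proceed when the previously flagged server is gone, or after a
            // timeout in case that server failed before it could be removed.
            removedCv.wait_for(lk, std::chrono::seconds(60), [&] { return removalDone; });
        }

        bool foundSSToRemove = false;
        for (auto& s : servers) {
            if (s.storeType != configuredStoreType) {
                s.flaggedForRemoval = true; // analogue of wrongStoreTypeToRemove.set(true)
                foundSSToRemove = true;
                std::cout << "flagging " << s.id << " (storeType " << s.storeType << ")\n";
                break; // at most one server per pass
            }
        }
        removalDone = false;         // analogue of doRemoveWrongStoreType.set(false)
        if (!foundSSToRemove) break; // clean pass: the actor can return

        // In FDB another component removes the flagged server and flips the flag
        // back to true; simulate that here so this single-threaded loop advances.
        servers.erase(std::remove_if(servers.begin(), servers.end(),
                                     [](const Server& s) { return s.flaggedForRemoval; }),
                      servers.end());
        removalDone = true;
    }
    std::cout << servers.size() << " server(s) remain, all " << configuredStoreType << "\n";
    return 0;
}
```

The clean-pass exit (`if (!foundSSToRemove) break;`) is what lets the actor terminate instead of polling on a fixed delay, which the deleted commented-out simulation block was approximating.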
diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp
index 3421e0b70b..9518ab04a4 100644
--- a/fdbserver/Knobs.cpp
+++ b/fdbserver/Knobs.cpp
@@ -196,9 +196,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 	init( TR_REMOVE_SERVER_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
 	init( TR_REMOVE_SERVER_TEAM_EXTRA_DELAY, 5.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_EXTRA_DELAY = deterministicRandom()->random01() * 10.0;
-	init( STR_NUM_SERVERS_REMOVED_ONCE, 1 ); if( randomize && BUGGIFY ) STR_NUM_SERVERS_REMOVED_ONCE = deterministicRandom()->random01() * 100.0;
-	init( STR_REMOVE_STORE_ENGINE_TIMEOUT, 60.0 ); if( randomize && BUGGIFY ) STR_REMOVE_STORE_ENGINE_TIMEOUT = deterministicRandom()->random01() * 60.0;
-	init( STR_REMOVE_STORE_ENGINE_DELAY, 60.0); if( randomize && BUGGIFY ) STR_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0;
+	init( DD_REMOVE_STORE_ENGINE_TIMEOUT, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_TIMEOUT = deterministicRandom()->random01() * 60.0;
+	init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0;
 
 	// Redwood Storage Engine
 	init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );
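Reviewer note on the knob idiom: `init()` sets the production default, and when the simulator runs with `randomize && BUGGIFY` the value is re-drawn so tests exercise unusual timings. A self-contained sketch of that pattern, assuming `std::mt19937_64` as a stand-in for `deterministicRandom()` and plain bools for the simulator's flags:

```cpp
#include <cstdio>
#include <random>

// Stand-in for deterministicRandom()->random01(); FDB's version is seeded
// deterministically so simulation runs are reproducible.
static std::mt19937_64 rng(42);
static double random01() { return std::uniform_real_distribution<double>(0.0, 1.0)(rng); }

int main() {
    const bool randomize = true, buggify = true; // stand-ins for the simulator's flags

    // Mirrors: init( DD_REMOVE_STORE_ENGINE_TIMEOUT, 60.0 ); if( randomize && BUGGIFY ) ...
    double ddRemoveStoreEngineTimeout = 60.0;
    double ddRemoveStoreEngineDelay = 60.0;
    if (randomize && buggify) {
        // Uniform in [0, 60): exercises both aggressive and lazy removal timings.
        ddRemoveStoreEngineTimeout = random01() * 60.0;
        ddRemoveStoreEngineDelay = random01() * 60.0;
    }
    std::printf("timeout=%.2f delay=%.2f\n", ddRemoveStoreEngineTimeout, ddRemoveStoreEngineDelay);
    return 0;
}
```

Outside simulation the defaults can still be overridden at process startup with fdbserver's `--knob_<name>` arguments (e.g. `--knob_dd_remove_store_engine_delay`); the BUGGIFY re-draw only happens in simulated runs.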
diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h
index 8865edaccd..91374e9e86 100644
--- a/fdbserver/Knobs.h
+++ b/fdbserver/Knobs.h
@@ -155,11 +155,9 @@ public:
 	double TR_REMOVE_SERVER_TEAM_DELAY; // wait for the specified time before try to remove next server team
 	double TR_REMOVE_SERVER_TEAM_EXTRA_DELAY; // serverTeamRemover waits for the delay and check DD healthyness again to ensure it runs after
 	                                          // machineTeamRemover
-	// WrongStoreTypeRemover to remove wrong storage engines
-	int STR_NUM_SERVERS_REMOVED_ONCE; // The number of servers with wrong storage engines to remove
-	double STR_REMOVE_STORE_ENGINE_TIMEOUT; // wait for at most timeout time before remove next batch of wrong stroage
-	                                        // engines
-	double STR_REMOVE_STORE_ENGINE_DELAY; // wait for the specified time before remove the next batch
+	// Remove wrong storage engines
+	double DD_REMOVE_STORE_ENGINE_TIMEOUT; // wait for at most the timeout before removing the next wrong storage engine
+	double DD_REMOVE_STORE_ENGINE_DELAY; // wait for the specified time before removing the next wrong storage engine
 
 	double DD_FAILURE_TIME;
 	double DD_ZERO_HEALTHY_TEAM_DELAY;