StorageEngineSwitch: Stop removeWrongStoreType actor if no SS has wrong storeType

Meng Xu 2019-08-16 16:11:28 -07:00
parent 2859dc57a8
commit 85ba904e2c
3 changed files with 16 additions and 34 deletions
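
For context, the heart of the change is a new foundSSToRemove flag that lets the actor terminate once every storage server already runs the configured storage engine, plus a rename of the STR_REMOVE_STORE_ENGINE_* knobs to DD_REMOVE_STORE_ENGINE_*. A condensed sketch of the reworked actor, taken from the diff below (tracing and commented-out code omitted):

ACTOR Future<Void> removeWrongStoreType(DDTeamCollection* self) {
	// Give storage servers time to initialize their storeType.
	wait( delay(SERVER_KNOBS->DD_REMOVE_STORE_ENGINE_DELAY) );
	state bool foundSSToRemove = false;
	loop {
		foundSSToRemove = false;
		if (self->doRemoveWrongStoreType.get() == false) {
			// Wait until the previously marked SS has been removed, or time out if it failed.
			wait( self->doRemoveWrongStoreType.onChange() ||
			      delay(SERVER_KNOBS->DD_REMOVE_STORE_ENGINE_TIMEOUT) );
		}
		for (auto& server : self->server_info) {
			if (!server.second->isCorrectStoreType(self->configuration.storageServerStoreType)) {
				server.second->wrongStoreTypeToRemove.set(true); // mark one SS for removal
				foundSSToRemove = true;
				break; // one SS at a time
			}
		}
		self->doRemoveWrongStoreType.set(false);
		if (!foundSSToRemove) {
			break; // every SS has the configured store type: stop the actor
		}
	}
	return Void();
}

The actor marks at most one SS per iteration (the break), presumably so data distribution never loses too many servers at once while data is re-replicated.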


@@ -715,7 +715,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
}
ACTOR static Future<Void> interruptableBuildTeams( DDTeamCollection* self ) {
TraceEvent("DDInterruptableBuildTeamsStart", self->distributorId);
if(!self->addSubsetComplete.isSet()) {
wait( addSubsetOfEmergencyTeams(self) );
self->addSubsetComplete.send(Void());
@@ -732,7 +731,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
}
ACTOR static Future<Void> checkBuildTeams( DDTeamCollection* self ) {
TraceEvent("DDCheckBuildTeamsStart", self->distributorId);
wait( self->checkTeamDelay );
while( !self->teamBuilder.isReady() )
wait( self->teamBuilder );
@@ -758,7 +756,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// shardsAffectedByTeamFailure or we could be dropping a shard on the floor (since team
// tracking is "edge triggered")
// SOMEDAY: Account for capacity, load (when shardMetrics load is high)
// Q: How do we enforce the above statement?
// self->teams.size() can be 0 under the ConfigureTest.txt test when we change configurations
// This happens rarely; we may want to eliminate the situation someday
@@ -2204,7 +2201,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("CurrentTeamCount", teams.size())
.detail("ServerCount", server_info.size())
.detail("NonFailedServerCount", desiredServerSet.size());
traceAllInfo(true);
}
bool shouldHandleServer(const StorageServerInterface &newServer) {
@@ -2216,11 +2212,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
void addServer( StorageServerInterface newServer, ProcessClass processClass, Promise<Void> errorOut, Version addedVersion ) {
if (!shouldHandleServer(newServer)) {
TraceEvent("AddedStorageServer", distributorId)
.detail("ServerID", newServer.id())
.detail("ShouldHandleServer", 0)
.detail("ServerDCId", newServer.locality.dcId())
.detail("IncludedDCSize", includedDCs.size());
return;
}
allServers.push_back( newServer.id() );
@@ -2445,7 +2436,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
TraceEvent(SevInfo, "NoTeamsRemovedWhenServerRemoved")
.detail("Primary", primary)
.detail("Debug", "ThisShouldRarelyHappen_CheckInfoBelow");
traceAllInfo(true);
}
// Step: Remove machine info related to removedServer
@@ -2562,46 +2552,41 @@ bool existOtherHealthyTeams(DDTeamCollection* self, UID serverID) {
ACTOR Future<Void> removeWrongStoreType(DDTeamCollection* self) {
// Wait for storage servers to initialize their storeType
wait( delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_DELAY) );
wait( delay(SERVER_KNOBS->DD_REMOVE_STORE_ENGINE_DELAY) );
state bool foundSSToRemove = false;
// TODO: How to reduce the amount of work when all SS have the correct store type (the common case)? Maybe refer to the badTeams remover approach
loop {
foundSSToRemove = false;
if (self->doRemoveWrongStoreType.get() == false) {
// Once the SS picked for removal (for having the wrong storeType) has been removed, doRemoveWrongStoreType is set back to true;
// in case that SS fails in the meantime, we time out and check the next SS.
wait(self->doRemoveWrongStoreType.onChange() || delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_TIMEOUT));
wait(self->doRemoveWrongStoreType.onChange() || delay(SERVER_KNOBS->DD_REMOVE_STORE_ENGINE_TIMEOUT));
}
TraceEvent("WrongStoreTypeRemoverStartLoop", self->distributorId)
.detail("Primary", self->primary)
.detail("ServerInfoSize", self->server_info.size())
.detail("SysRestoreType", self->configuration.storageServerStoreType);
vector<Future<KeyValueStoreType>> initializingServers;
for (auto& server : self->server_info) {
NetworkAddress a = server.second->lastKnownInterface.address();
AddressExclusion addr(a.ip, a.port);
TraceEvent("WrongStoreTypeRemover", self->distributorId)
.detail("DDID", self->distributorId)
.detail("Server", server.first)
.detail("Addr", addr.toString())
.detail("StoreType", server.second->storeType)
.detail("IsCorrectStoreType",
server.second->isCorrectStoreType(self->configuration.storageServerStoreType));
.detail("ConfiguredStoreType", self->configuration.storageServerStoreType);
//if (!server.second->isCorrectStoreType(self->configuration.storageServerStoreType) && existOtherHealthyTeams(self, server.first)) {
if (!server.second->isCorrectStoreType(self->configuration.storageServerStoreType)) {
server.second->wrongStoreTypeToRemove.set(true);
foundSSToRemove = true;
break;
}
}
self->doRemoveWrongStoreType.set(false);
// if (g_network->isSimulated()) {
// // Speed up removing wrong storeType server in simulation to avoid false positive test failure in consistency check
// wait( delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_DELAY / 10) );
// } else {
// wait( delay(SERVER_KNOBS->STR_REMOVE_STORE_ENGINE_DELAY) );
// }
if (!foundSSToRemove) {
break;
}
}
return Void();
}
ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
state int numMachineTeamRemoved = 0;
loop {

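The loop coordinates with the rest of data distribution through two AsyncVar<bool>s: wrongStoreTypeToRemove on the picked server, and doRemoveWrongStoreType, which the actor clears after marking a server and which is set back to true elsewhere once that server's removal completes (the setter is not part of this diff). A minimal, hypothetical sketch of the Flow wait/signal pattern involved, with made-up actor names:

ACTOR Future<Void> waitForSignal(AsyncVar<bool>* done, double timeout) {
	if (!done->get()) {
		// Resumes when set() actually changes the value, or when the
		// timeout fires; operator|| completes on the first ready future.
		wait( done->onChange() || delay(timeout) );
	}
	return Void();
}

ACTOR Future<Void> signalDone(AsyncVar<bool>* done) {
	wait( delay(1.0) );  // stand-in for the real work (removing the marked SS)
	done->set(true);     // wakes every onChange() waiter, since the value changes
	return Void();
}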

@@ -196,9 +196,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( TR_REMOVE_SERVER_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
init( TR_REMOVE_SERVER_TEAM_EXTRA_DELAY, 5.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_EXTRA_DELAY = deterministicRandom()->random01() * 10.0;
init( STR_NUM_SERVERS_REMOVED_ONCE, 1 ); if( randomize && BUGGIFY ) STR_NUM_SERVERS_REMOVED_ONCE = deterministicRandom()->random01() * 100.0;
init( STR_REMOVE_STORE_ENGINE_TIMEOUT, 60.0 ); if( randomize && BUGGIFY ) STR_REMOVE_STORE_ENGINE_TIMEOUT = deterministicRandom()->random01() * 60.0;
init( STR_REMOVE_STORE_ENGINE_DELAY, 60.0); if( randomize && BUGGIFY ) STR_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0;
init( DD_REMOVE_STORE_ENGINE_TIMEOUT, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_TIMEOUT = deterministicRandom()->random01() * 60.0;
init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0;
// Redwood Storage Engine
init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );

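For readers unfamiliar with the knob idiom above: init() sets the production default, and the BUGGIFY clause applies only in simulation, where a buggified run draws the value uniformly from [0, 60) seconds so tests exercise both short and long delays and timeouts. The renamed knob's initialization, reformatted with explanatory comments (no new code):

init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 );          // production default: 60 seconds
if( randomize && BUGGIFY )                           // simulation-only randomization
	DD_REMOVE_STORE_ENGINE_DELAY =
	    deterministicRandom()->random01() * 60.0;    // uniform in [0, 60)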

@@ -155,11 +155,9 @@ public:
double TR_REMOVE_SERVER_TEAM_DELAY; // wait for the specified time before trying to remove the next server team
double TR_REMOVE_SERVER_TEAM_EXTRA_DELAY; // serverTeamRemover waits for the delay and checks DD healthiness again to ensure it runs after machineTeamRemover
// WrongStoreTypeRemover to remove wrong storage engines
int STR_NUM_SERVERS_REMOVED_ONCE; // The number of servers with wrong storage engines to remove
double STR_REMOVE_STORE_ENGINE_TIMEOUT; // wait for at most the timeout before removing the next batch of wrong storage engines
double STR_REMOVE_STORE_ENGINE_DELAY; // wait for the specified time before removing the next batch
// Remove wrong storage engines
double DD_REMOVE_STORE_ENGINE_TIMEOUT; // wait for at most the timeout before removing the next wrong storage engine
double DD_REMOVE_STORE_ENGINE_DELAY; // wait for the specified time before removing the next batch
double DD_FAILURE_TIME;
double DD_ZERO_HEALTHY_TEAM_DELAY;
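
Finally, the commit illustrates the three places a server knob lives, which is why the rename touches both knob files as well as the actor: the declaration in the knobs header (the last hunk), the default in the ServerKnobs constructor (the middle hunk), and the read through the global SERVER_KNOBS. Condensed from the hunks above:

// Declaration (knobs header, last hunk above):
double DD_REMOVE_STORE_ENGINE_DELAY;

// Default and simulation randomization (ServerKnobs constructor):
init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 );

// Read at the use site (removeWrongStoreType, first file):
wait( delay(SERVER_KNOBS->DD_REMOVE_STORE_ENGINE_DELAY) );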