relax perpetual wiggle pause condition; add trace log; correct perpetual wiggle priority setting
This commit is contained in:
parent
2abdbff11f
commit
501dc339a9
|
@ -255,7 +255,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1;
|
||||
init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5;
|
||||
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 1 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 10;
|
||||
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 50 );
|
||||
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 50 );
|
||||
|
||||
// TeamRemover
|
||||
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
|
||||
|
|
|
@ -208,6 +208,7 @@ public:
|
|||
int DD_TEAMS_INFO_PRINT_YIELD_COUNT;
|
||||
int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY;
|
||||
int DD_STORAGE_WIGGLE_PAUSE_THRESHOLD; // How many unhealthy relocations are ongoing will pause storage wiggle
|
||||
int DD_STORAGE_WIGGLE_STUCK_THRESHOLD; // How many times bestTeamStuck accumulate will pause storage wiggle
|
||||
|
||||
// TeamRemover to remove redundant teams
|
||||
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
|
||||
|
|
|
@ -656,7 +656,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
int optimalTeamCount;
|
||||
AsyncVar<bool> zeroOptimalTeams;
|
||||
|
||||
bool bestTeamStuck = false;
|
||||
int bestTeamKeepStuckCount = 0;
|
||||
|
||||
bool isTssRecruiting; // If tss recruiting is waiting on a pair, don't consider DD recruiting for the purposes of QuietDB
|
||||
|
||||
|
@ -1011,12 +1011,12 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
|
||||
// Log BestTeamStuck reason when we have healthy teams but they do not have healthy free space
|
||||
if (randomTeams.empty() && !self->zeroHealthyTeams->get()) {
|
||||
self->bestTeamStuck = true;
|
||||
self->bestTeamKeepStuckCount++;
|
||||
if (g_network->isSimulated()) {
|
||||
TraceEvent(SevWarn, "GetTeamReturnEmpty").detail("HealthyTeams", self->healthyTeamCount);
|
||||
}
|
||||
} else {
|
||||
self->bestTeamStuck = false;
|
||||
self->bestTeamKeepStuckCount = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < randomTeams.size(); i++) {
|
||||
|
@ -2833,7 +2833,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
std::vector<Future<Void>> moveFutures;
|
||||
if (this->pid2server_info.count(pid) != 0) {
|
||||
for (auto& info : this->pid2server_info[pid]) {
|
||||
AddressExclusion addr(info->lastKnownInterface.address().ip);
|
||||
AddressExclusion addr(info->lastKnownInterface.address().ip, info->lastKnownInterface.address().port);
|
||||
if (this->excludedServers.count(addr) &&
|
||||
this->excludedServers.get(addr) != DDTeamCollection::Status::NONE) {
|
||||
continue; // don't overwrite the value set by actor trackExcludedServer
|
||||
|
@ -3509,7 +3509,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
|||
bool anyUndesired = false;
|
||||
bool anyWrongConfiguration = false;
|
||||
bool anyWigglingServer = false;
|
||||
int serversLeft = 0;
|
||||
int serversLeft = 0, serverUndesired = 0, serverWrongConf = 0, serverWiggling = 0;
|
||||
|
||||
for (const UID& uid : team->getServerIDs()) {
|
||||
change.push_back(self->server_status.onChange(uid));
|
||||
|
@ -3519,12 +3519,15 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
|||
}
|
||||
if (status.isUndesired) {
|
||||
anyUndesired = true;
|
||||
serverUndesired++;
|
||||
}
|
||||
if (status.isWrongConfiguration) {
|
||||
anyWrongConfiguration = true;
|
||||
serverWrongConf++;
|
||||
}
|
||||
if (status.isWiggling) {
|
||||
anyWigglingServer = true;
|
||||
serverWiggling++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3646,6 +3649,10 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
|||
team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_2_LEFT);
|
||||
else
|
||||
team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY);
|
||||
} else if (!badTeam && anyWigglingServer && serverWiggling == serverWrongConf &&
|
||||
serverWiggling == serverUndesired) {
|
||||
// the wrongly configured and undesired servers are the wiggling servers (their counts match)
|
||||
team->setPriority(SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE);
|
||||
} else if (badTeam || anyWrongConfiguration) {
|
||||
if (redundantTeam) {
|
||||
team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT);
|
||||
|
@ -3654,8 +3661,6 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
|||
}
|
||||
} else if (anyUndesired) {
|
||||
team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER);
|
||||
} else if (anyWigglingServer) {
|
||||
team->setPriority(SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE);
|
||||
} else {
|
||||
team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_HEALTHY);
|
||||
}
|
||||
|
@ -3972,7 +3977,7 @@ ACTOR Future<Void> perpetualStorageWiggleIterator(AsyncVar<bool>* stopSignal,
|
|||
wait(delayJittered(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY));
|
||||
// in that case there are no other teams available to place the wiggled data
|
||||
takeRest = teamCollection->server_info.size() <= teamCollection->configuration.storageTeamSize ||
|
||||
teamCollection->machine_info.size() < teamCollection->configuration.storageTeamSize;
|
||||
teamCollection->machine_info.size() < teamCollection->configuration.storageTeamSize;
|
||||
}
|
||||
wait(updateNextWigglingStoragePID(teamCollection));
|
||||
}
|
||||
|
@ -4020,10 +4025,12 @@ ACTOR Future<Void> clusterHealthCheckForPerpetualWiggle(DDTeamCollection* self,
|
|||
// b. healthy teams are not enough
|
||||
// c. the overall disk space is not enough
|
||||
if (count >= SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD || self->healthyTeamCount <= *extraTeamCount ||
|
||||
self->bestTeamStuck) {
|
||||
self->bestTeamKeepStuckCount > SERVER_KNOBS->DD_STORAGE_WIGGLE_STUCK_THRESHOLD) {
|
||||
// if we pause wiggle for a reason other than reason a, increase extraTeamCount. This helps avoid oscillation
|
||||
// between pause and non-pause status.
|
||||
if ((self->healthyTeamCount <= *extraTeamCount || self->bestTeamStuck) && !self->pauseWiggle->get()) {
|
||||
if ((self->healthyTeamCount <= *extraTeamCount ||
|
||||
self->bestTeamKeepStuckCount > SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD) &&
|
||||
!self->pauseWiggle->get()) {
|
||||
*extraTeamCount = std::min(*extraTeamCount + pausePenalty, (int)self->teams.size());
|
||||
pausePenalty = std::min(pausePenalty * 2, (int)self->teams.size());
|
||||
}
|
||||
|
@ -4060,6 +4067,7 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncVar<bool>* stopSignal,
|
|||
self->includeStorageServersForWiggle();
|
||||
TraceEvent("PerpetualStorageWigglePause", self->distributorId)
|
||||
.detail("ProcessId", pid)
|
||||
.detail("BestTeamKeepStuckCount", self->bestTeamKeepStuckCount)
|
||||
.detail("ExtraHealthyTeamCount", extraTeamCount)
|
||||
.detail("HealthyTeamCount", self->healthyTeamCount)
|
||||
.detail("StorageCount", movingCount);
|
||||
|
@ -4566,6 +4574,10 @@ ACTOR Future<Void> storageServerTracker(
|
|||
DDTeamCollection::Status worstStatus = self->excludedServers.get(worstAddr);
|
||||
|
||||
if (worstStatus == DDTeamCollection::Status::WIGGLING && invalidWiggleServer(worstAddr, self, server)) {
|
||||
TraceEvent(SevInfo, "InvalidWiggleServer", self->distributorId)
|
||||
.detail("Address", worstAddr.toString())
|
||||
.detail("ProcessId", server->lastKnownInterface.locality.processId())
|
||||
.detail("ValidWigglingId", self->wigglingPid.present());
|
||||
self->excludedServers.set(worstAddr, DDTeamCollection::Status::NONE);
|
||||
worstStatus = DDTeamCollection::Status::NONE;
|
||||
}
|
||||
|
@ -4586,6 +4598,10 @@ ACTOR Future<Void> storageServerTracker(
|
|||
DDTeamCollection::Status testStatus = self->excludedServers.get(testAddr);
|
||||
|
||||
if (testStatus == DDTeamCollection::Status::WIGGLING && invalidWiggleServer(testAddr, self, server)) {
|
||||
TraceEvent(SevInfo, "InvalidWiggleServer", self->distributorId)
|
||||
.detail("Address", testAddr.toString())
|
||||
.detail("ProcessId", server->lastKnownInterface.locality.processId())
|
||||
.detail("ValidWigglingId", self->wigglingPid.present());
|
||||
self->excludedServers.set(testAddr, DDTeamCollection::Status::NONE);
|
||||
testStatus = DDTeamCollection::Status::NONE;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue