Merge pull request #1907 from etschannen/master
A number of bug fixes for rare problems found by correctness testing
This commit is contained in:
commit
a0f26b604c
|
@ -540,6 +540,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
DatabaseConfiguration configuration;
|
||||
|
||||
bool doBuildTeams;
|
||||
bool lastBuildTeamsFailed;
|
||||
Future<Void> teamBuilder;
|
||||
AsyncTrigger restartTeamBuilder;
|
||||
|
||||
|
@ -626,7 +627,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
Reference<AsyncVar<bool>> zeroHealthyTeams, bool primary,
|
||||
Reference<AsyncVar<bool>> processingUnhealthy)
|
||||
: cx(cx), distributorId(distributorId), lock(lock), output(output),
|
||||
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()),
|
||||
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()),
|
||||
badTeamRemover(Void()), redundantMachineTeamRemover(Void()), redundantServerTeamRemover(Void()),
|
||||
configuration(configuration), readyToStart(readyToStart), clearHealthyZoneFuture(Void()),
|
||||
checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskPriority::DataDistribution)),
|
||||
|
@ -1449,6 +1450,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId)
|
||||
.detail("Primary", primary)
|
||||
.detail("Reason", "Unable to make desired machine Teams");
|
||||
lastBuildTeamsFailed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1874,6 +1876,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
|
||||
if (bestServerTeam.size() != configuration.storageTeamSize) {
|
||||
// No team was found, and it is unlikely that one will be found
|
||||
lastBuildTeamsFailed = true;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -2018,7 +2021,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
.detail("MachineTeamCount", self->machineTeams.size())
|
||||
.detail("MachineCount", self->machine_info.size())
|
||||
.detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER);
|
||||
|
||||
|
||||
self->lastBuildTeamsFailed = false;
|
||||
if (teamsToBuild > 0 || self->notEnoughTeamsForAServer()) {
|
||||
state vector<std::vector<UID>> builtTeams;
|
||||
|
||||
|
@ -3099,7 +3103,7 @@ ACTOR Future<Void> storageServerFailureTracker(
|
|||
choose {
|
||||
when ( wait(healthChanged) ) {
|
||||
status->isFailed = !status->isFailed;
|
||||
if(!status->isFailed && !server->teams.size()) {
|
||||
if(!status->isFailed && (server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || self->lastBuildTeamsFailed)) {
|
||||
self->doBuildTeams = true;
|
||||
}
|
||||
if(status->isFailed && self->healthyZone.get().present() && self->clearHealthyZoneFuture.isReady()) {
|
||||
|
@ -3221,7 +3225,7 @@ ACTOR Future<Void> storageServerTracker(
|
|||
self->restartRecruiting.trigger();
|
||||
|
||||
if (lastIsUnhealthy && !status.isUnhealthy() &&
|
||||
server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) {
|
||||
( server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || self->lastBuildTeamsFailed)) {
|
||||
self->doBuildTeams = true;
|
||||
self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams
|
||||
}
|
||||
|
|
|
@ -246,7 +246,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( SPRING_CLEANING_MAX_VACUUM_PAGES, 1e9 ); if( randomize && BUGGIFY ) SPRING_CLEANING_MAX_VACUUM_PAGES = deterministicRandom()->coinflip() ? 0 : deterministicRandom()->randomInt(1, 1e4);
|
||||
|
||||
// KeyValueStoreMemory
|
||||
init( REPLACE_CONTENTS_BYTES, 1e5 ); if( randomize && BUGGIFY ) REPLACE_CONTENTS_BYTES = 1e3;
|
||||
init( REPLACE_CONTENTS_BYTES, 1e5 );
|
||||
|
||||
// Leader election
|
||||
bool longLeaderElection = randomize && BUGGIFY;
|
||||
|
|
|
@ -1547,6 +1547,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
}
|
||||
if(maxRecoveryIndex > 0) {
|
||||
logServers = oldLogData[maxRecoveryIndex-1].tLogs;
|
||||
prevState.txsTags = oldLogData[maxRecoveryIndex-1].txsTags;
|
||||
lockResults[0] = allLockResults[maxRecoveryIndex];
|
||||
lockResults[0].isCurrent = true;
|
||||
|
||||
|
|
Loading…
Reference in New Issue