Merge pull request #1907 from etschannen/master

A number of bug fixes for rare problems found by correctness testing
This commit is contained in:
Evan Tschannen 2019-07-29 21:04:38 -07:00 committed by GitHub
commit a0f26b604c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 10 additions and 5 deletions

View File

@ -540,6 +540,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
DatabaseConfiguration configuration;
bool doBuildTeams;
bool lastBuildTeamsFailed;
Future<Void> teamBuilder;
AsyncTrigger restartTeamBuilder;
@ -626,7 +627,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
Reference<AsyncVar<bool>> zeroHealthyTeams, bool primary,
Reference<AsyncVar<bool>> processingUnhealthy)
: cx(cx), distributorId(distributorId), lock(lock), output(output),
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()),
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()),
badTeamRemover(Void()), redundantMachineTeamRemover(Void()), redundantServerTeamRemover(Void()),
configuration(configuration), readyToStart(readyToStart), clearHealthyZoneFuture(Void()),
checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskPriority::DataDistribution)),
@ -1449,6 +1450,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId)
.detail("Primary", primary)
.detail("Reason", "Unable to make desired machine Teams");
lastBuildTeamsFailed = true;
break;
}
}
@ -1874,6 +1876,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
if (bestServerTeam.size() != configuration.storageTeamSize) {
// Did not find any team, and it is unlikely that one will be found
lastBuildTeamsFailed = true;
break;
}
@ -2018,7 +2021,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("MachineTeamCount", self->machineTeams.size())
.detail("MachineCount", self->machine_info.size())
.detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER);
self->lastBuildTeamsFailed = false;
if (teamsToBuild > 0 || self->notEnoughTeamsForAServer()) {
state vector<std::vector<UID>> builtTeams;
@ -3099,7 +3103,7 @@ ACTOR Future<Void> storageServerFailureTracker(
choose {
when ( wait(healthChanged) ) {
status->isFailed = !status->isFailed;
if(!status->isFailed && !server->teams.size()) {
if(!status->isFailed && (server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || self->lastBuildTeamsFailed)) {
self->doBuildTeams = true;
}
if(status->isFailed && self->healthyZone.get().present() && self->clearHealthyZoneFuture.isReady()) {
@ -3221,7 +3225,7 @@ ACTOR Future<Void> storageServerTracker(
self->restartRecruiting.trigger();
if (lastIsUnhealthy && !status.isUnhealthy() &&
server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) {
( server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || self->lastBuildTeamsFailed)) {
self->doBuildTeams = true;
self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams
}

View File

@ -246,7 +246,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( SPRING_CLEANING_MAX_VACUUM_PAGES, 1e9 ); if( randomize && BUGGIFY ) SPRING_CLEANING_MAX_VACUUM_PAGES = deterministicRandom()->coinflip() ? 0 : deterministicRandom()->randomInt(1, 1e4);
// KeyValueStoreMemory
init( REPLACE_CONTENTS_BYTES, 1e5 ); if( randomize && BUGGIFY ) REPLACE_CONTENTS_BYTES = 1e3;
init( REPLACE_CONTENTS_BYTES, 1e5 );
// Leader election
bool longLeaderElection = randomize && BUGGIFY;

View File

@ -1547,6 +1547,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
if(maxRecoveryIndex > 0) {
logServers = oldLogData[maxRecoveryIndex-1].tLogs;
prevState.txsTags = oldLogData[maxRecoveryIndex-1].txsTags;
lockResults[0] = allLockResults[maxRecoveryIndex];
lockResults[0].isCurrent = true;