Merge pull request #573 from etschannen/feature-remote-logs

Data distribution bug fixes
Evan Tschannen 2018-07-07 17:43:11 -07:00 committed by GitHub
commit 1c90e948f8
4 changed files with 16 additions and 13 deletions


@@ -1077,21 +1077,28 @@ struct DDTeamCollection {
// If there are too few machines to even build teams or there are too few represented datacenters, build no new teams
if( uniqueMachines >= self->configuration.storageTeamSize ) {
desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER*serverCount;
+ int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER*serverCount;
// Count only properly sized teams against the desired number of teams. This is to prevent "emergency" merged teams (see MoveKeys)
// from overwhelming the team count (since we really did not want that team in the first place). These larger teams will not be
// returned from getRandomTeam() (as used by bestTeam to find a new home for a shard).
// Also exclude teams who have members in the wrong configuration, since we don't want these teams either
int teamCount = 0;
+ int totalTeamCount = 0;
for(int i = 0; i < self->teams.size(); i++) {
if( self->teams[i]->getServerIDs().size() == self->configuration.storageTeamSize && !self->teams[i]->isWrongConfiguration() ) {
- teamCount++;
+ if( self->teams[i]->isHealthy() ) {
+ teamCount++;
+ }
+ totalTeamCount++;
}
}
- TraceEvent("BuildTeamsBegin", self->masterId).detail("DesiredTeams", desiredTeams).detail("UniqueMachines", uniqueMachines)
- .detail("TeamSize", self->configuration.storageTeamSize).detail("Servers", serverCount)
- .detail("CurrentTrackedTeams", self->teams.size()).detail("TeamCount", teamCount);
+ TraceEvent("BuildTeamsBegin", self->masterId).detail("DesiredTeams", desiredTeams).detail("MaxTeams", maxTeams)
+ .detail("UniqueMachines", uniqueMachines).detail("TeamSize", self->configuration.storageTeamSize).detail("Servers", serverCount)
+ .detail("CurrentTrackedTeams", self->teams.size()).detail("HealthyTeamCount", teamCount).detail("TotalTeamCount", totalTeamCount);
+ teamCount = std::max(teamCount, desiredTeams + totalTeamCount - maxTeams );
if( desiredTeams > teamCount ) {
std::set<UID> desiredServerSet;
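The net effect of this hunk: only healthy, correctly sized, correctly configured teams count against the desired number, while the new std::max clamp stops team building once the total number of tracked teams would exceed the MAX_TEAMS_PER_SERVER cap. A minimal standalone sketch of that decision, in plain C++ with a hypothetical helper name (not a function in this codebase):

    #include <algorithm>

    // Hypothetical helper: should the team builder keep creating teams, given
    // the counts computed in the hunk above?
    bool shouldBuildMoreTeams(int healthyTeamCount, int totalTeamCount,
                              int desiredTeams, int maxTeams) {
        // Once totalTeamCount reaches maxTeams, this term is >= desiredTeams,
        // so the comparison below fails and no more teams are built even if
        // some existing teams are unhealthy.
        int effectiveCount = std::max(healthyTeamCount,
                                      desiredTeams + totalTeamCount - maxTeams);
        return desiredTeams > effectiveCount;
    }

Here desiredTeams and maxTeams correspond to DESIRED_TEAMS_PER_SERVER*serverCount and MAX_TEAMS_PER_SERVER*serverCount in the diff.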


@@ -921,18 +921,12 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
bestTeams.push_back(std::make_pair(bestTeam.get(), foundSource));
tciIndex++;
}
- if (foundTeams) {
+ if (foundTeams && anyHealthy) {
break;
}
TEST(true); //did not find a healthy destination team on the first attempt
stuckCount++;
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", masterId).detail("Count", stuckCount).suppressFor(1.0);
- if(stuckCount > 50 && g_network->isSimulated()) { //FIXME: known bug in simulation we are supressing
- int unseed = noUnseed ? 0 : g_random->randomInt(0, 100001);
- TraceEvent("ElapsedTime").detail("SimTime", now()).detail("RealTime", 0)
- .detail("RandomUnseed", unseed);
- flushAndExit(0);
- }
Void _ = wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) );
}
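Two behavioural changes land in this hunk: the relocator now keeps retrying until at least one of the chosen destination teams is healthy (previously it broke out of the loop as soon as every team collection returned any team), and the simulation-only flushAndExit() escape hatch for the long-stuck case is removed. A rough sketch of the selection loop after the change, in plain C++ with hypothetical Team/getBestTeam/retryDelay stand-ins for the flow actor machinery:

    #include <functional>
    #include <optional>
    #include <vector>

    struct Team { bool healthy = false; };                 // hypothetical

    // Hypothetical stand-ins for querying one team collection and for the
    // BEST_TEAM_STUCK_DELAY back-off used by the real actor.
    using BestTeamFn = std::function<std::optional<Team>(int collectionIndex)>;
    using RetryDelayFn = std::function<void()>;

    std::vector<Team> pickDestinations(int numCollections, const BestTeamFn& getBestTeam,
                                       const RetryDelayFn& retryDelay) {
        std::vector<Team> bestTeams;
        while (true) {
            bestTeams.clear();
            bool foundTeams = true, anyHealthy = false;
            for (int i = 0; i < numCollections; i++) {
                std::optional<Team> team = getBestTeam(i);
                if (!team) { foundTeams = false; break; }
                anyHealthy = anyHealthy || team->healthy;
                bestTeams.push_back(*team);
            }
            // Old: break on foundTeams alone. New: also require at least one
            // healthy destination, otherwise back off and try again.
            if (foundTeams && anyHealthy)
                return bestTeams;
            retryDelay();
        }
    }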
@@ -976,7 +970,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
state Error error = success();
state Promise<Void> dataMovementComplete;
state Future<Void> doMoveKeys = moveKeys(self->cx, rd.keys, destIds, healthyIds, self->lock, dataMovementComplete, &self->startMoveKeysParallelismLock, &self->finishMoveKeysParallelismLock, self->recoveryVersion, self->teamCollections.size() > 1, relocateShardInterval.pairID );
- state Future<Void> pollHealth = (!anyHealthy || signalledTransferComplete) ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
+ state Future<Void> pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
try {
loop {
choose {
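The other change in this function: the destination-health poll no longer depends on whether any healthy team existed when the move started, only on whether the transfer has been signalled complete. Expressed as plain predicates over the actor's state flags (a sketch, not the actor code itself):

    // When does pollHealth fire instead of being set to Never()?
    bool pollsBeforeThisCommit(bool anyHealthy, bool signalledTransferComplete) {
        return anyHealthy && !signalledTransferComplete;   // unhealthy moves were never polled
    }
    bool pollsAfterThisCommit(bool signalledTransferComplete) {
        return !signalledTransferComplete;                 // poll until the move completes
    }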


@@ -148,6 +148,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( FREE_SPACE_RATIO_CUTOFF, 0.1 );
init( FREE_SPACE_RATIO_DD_CUTOFF, 0.2 );
init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = 1;
+ init( MAX_TEAMS_PER_SERVER, 3*DESIRED_TEAMS_PER_SERVER );
init( DD_SHARD_SIZE_GRANULARITY, 5000000 );
init( DD_SHARD_SIZE_GRANULARITY_SIM, 500000 ); if( randomize && BUGGIFY ) DD_SHARD_SIZE_GRANULARITY_SIM = 0;
init( DD_MOVE_KEYS_PARALLELISM, 20 ); if( randomize && BUGGIFY ) DD_MOVE_KEYS_PARALLELISM = 1;
@@ -252,7 +253,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( MIN_BALANCE_DIFFERENCE, 10000 );
init( SECONDS_BEFORE_NO_FAILURE_DELAY, 8 * 3600 );
init( MAX_TXS_SEND_MEMORY, 1e7 ); if( randomize && BUGGIFY ) MAX_TXS_SEND_MEMORY = 1e5;
- init( MAX_RECOVERY_VERSIONS, 200 * VERSIONS_PER_SECOND ); if( randomize && BUGGIFY ) MAX_RECOVERY_VERSIONS = VERSIONS_PER_SECOND;
+ init( MAX_RECOVERY_VERSIONS, 200 * VERSIONS_PER_SECOND );
init( MAX_RECOVERY_TIME, 20.0 ); if( randomize && BUGGIFY ) MAX_RECOVERY_TIME = 1.0;
// Resolver
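Because the new cap is defined as a multiple of DESIRED_TEAMS_PER_SERVER, BUGGIFY-randomizing the desired knob down to 1 also lowers the cap automatically. A small worked example of how the two knobs feed the team-builder arithmetic above, in plain C++ with a hypothetical serverCount:

    int DESIRED_TEAMS_PER_SERVER = 5;                         // becomes 1 when BUGGIFY fires
    int MAX_TEAMS_PER_SERVER = 3 * DESIRED_TEAMS_PER_SERVER;  // 15 (or 3 under BUGGIFY)

    int serverCount = 10;                                     // hypothetical cluster size
    int desiredTeams = DESIRED_TEAMS_PER_SERVER * serverCount;   // 50
    int maxTeams     = MAX_TEAMS_PER_SERVER * serverCount;       // 150

The declarations in the header hunk below expose the new knob alongside DESIRED_TEAMS_PER_SERVER.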


@@ -112,6 +112,7 @@ public:
double FREE_SPACE_RATIO_CUTOFF;
double FREE_SPACE_RATIO_DD_CUTOFF;
int DESIRED_TEAMS_PER_SERVER;
+ int MAX_TEAMS_PER_SERVER;
int64_t DD_SHARD_SIZE_GRANULARITY;
int64_t DD_SHARD_SIZE_GRANULARITY_SIM;
int DD_MOVE_KEYS_PARALLELISM;