Merge pull request #573 from etschannen/feature-remote-logs
Data distribution bug fixes
commit 1c90e948f8
@@ -1077,21 +1077,28 @@ struct DDTeamCollection {
 // If there are too few machines to even build teams or there are too few represented datacenters, build no new teams
 if( uniqueMachines >= self->configuration.storageTeamSize ) {
     desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER*serverCount;
+    int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER*serverCount;
 
     // Count only properly sized teams against the desired number of teams. This is to prevent "emergency" merged teams (see MoveKeys)
     // from overwhelming the team count (since we really did not want that team in the first place). These larger teams will not be
     // returned from getRandomTeam() (as used by bestTeam to find a new home for a shard).
     // Also exclude teams who have members in the wrong configuration, since we don't want these teams either
     int teamCount = 0;
+    int totalTeamCount = 0;
     for(int i = 0; i < self->teams.size(); i++) {
         if( self->teams[i]->getServerIDs().size() == self->configuration.storageTeamSize && !self->teams[i]->isWrongConfiguration() ) {
-            teamCount++;
+            if( self->teams[i]->isHealthy() ) {
+                teamCount++;
+            }
+            totalTeamCount++;
         }
     }
 
-    TraceEvent("BuildTeamsBegin", self->masterId).detail("DesiredTeams", desiredTeams).detail("UniqueMachines", uniqueMachines)
-        .detail("TeamSize", self->configuration.storageTeamSize).detail("Servers", serverCount)
-        .detail("CurrentTrackedTeams", self->teams.size()).detail("TeamCount", teamCount);
+    TraceEvent("BuildTeamsBegin", self->masterId).detail("DesiredTeams", desiredTeams).detail("MaxTeams", maxTeams)
+        .detail("UniqueMachines", uniqueMachines).detail("TeamSize", self->configuration.storageTeamSize).detail("Servers", serverCount)
+        .detail("CurrentTrackedTeams", self->teams.size()).detail("HealthyTeamCount", teamCount).detail("TotalTeamCount", totalTeamCount);
 
+    teamCount = std::max(teamCount, desiredTeams + totalTeamCount - maxTeams );
+
     if( desiredTeams > teamCount ) {
         std::set<UID> desiredServerSet;
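Review note on the hunk above: teamCount now counts only healthy teams, and the new maxTeams bound throttles team building as the total approaches the cap. A minimal standalone sketch of the arithmetic (plain C++; only the std::max formula and the knob defaults of 5 and 3x5 come from this commit, the cluster numbers are made-up inputs):

    #include <algorithm>
    #include <cstdio>

    int main() {
        int serverCount    = 10;
        int desiredTeams   = 5 * serverCount;   // DESIRED_TEAMS_PER_SERVER = 5  -> 50
        int maxTeams       = 15 * serverCount;  // MAX_TEAMS_PER_SERVER = 3*5    -> 150
        int healthyTeams   = 40;                // properly sized, healthy, right config
        int totalTeamCount = 145;               // all properly sized teams, healthy or not

        // The new line in the diff: once totalTeamCount approaches maxTeams,
        // the effective teamCount is inflated so (desiredTeams - teamCount)
        // shrinks and team building stops before the cap is exceeded.
        int teamCount = std::max(healthyTeams, desiredTeams + totalTeamCount - maxTeams);

        // With these numbers: max(40, 50 + 145 - 150) = 45, so only 5 more
        // teams may be built instead of the 10 the healthy count alone implies.
        printf("effective teamCount = %d, teams still wanted = %d\n",
               teamCount, std::max(0, desiredTeams - teamCount));
        return 0;
    }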
@@ -921,18 +921,12 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
         bestTeams.push_back(std::make_pair(bestTeam.get(), foundSource));
         tciIndex++;
     }
-    if (foundTeams) {
+    if (foundTeams && anyHealthy) {
         break;
     }
     TEST(true); //did not find a healthy destination team on the first attempt
     stuckCount++;
     TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", masterId).detail("Count", stuckCount).suppressFor(1.0);
-    if(stuckCount > 50 && g_network->isSimulated()) { //FIXME: known bug in simulation we are supressing
-        int unseed = noUnseed ? 0 : g_random->randomInt(0, 100001);
-        TraceEvent("ElapsedTime").detail("SimTime", now()).detail("RealTime", 0)
-            .detail("RandomUnseed", unseed);
-        flushAndExit(0);
-    }
     Void _ = wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) );
 }
 
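Review note: the relocator now refuses to settle for a destination with no healthy team, retrying with a warning and a delay instead; the simulation-only flushAndExit workaround is also gone. A plain-C++ sketch of the retry shape (the real code is a flow actor; tryFindTeams and the 100 ms delay are stand-ins, not FoundationDB APIs):

    #include <chrono>
    #include <cstdio>
    #include <thread>

    // Stand-in for the team search: pretend a healthy team appears on attempt 3.
    bool tryFindTeams(bool& anyHealthy, int attempt) {
        anyHealthy = attempt >= 3;
        return true;  // teams were found, healthy or not
    }

    int main() {
        int stuckCount = 0;
        for (;;) {
            bool anyHealthy = false;
            bool foundTeams = tryFindTeams(anyHealthy, stuckCount);
            // The fix in this hunk: finding teams is no longer enough; at
            // least one destination team must be healthy, else keep retrying.
            if (foundTeams && anyHealthy) break;
            ++stuckCount;
            fprintf(stderr, "%s: BestTeamStuck count=%d\n",
                    stuckCount > 50 ? "SevWarnAlways" : "SevWarn", stuckCount);
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
        printf("healthy destination found after %d stuck iterations\n", stuckCount);
        return 0;
    }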
@@ -976,7 +970,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
 state Error error = success();
 state Promise<Void> dataMovementComplete;
 state Future<Void> doMoveKeys = moveKeys(self->cx, rd.keys, destIds, healthyIds, self->lock, dataMovementComplete, &self->startMoveKeysParallelismLock, &self->finishMoveKeysParallelismLock, self->recoveryVersion, self->teamCollections.size() > 1, relocateShardInterval.pairID );
-state Future<Void> pollHealth = (!anyHealthy || signalledTransferComplete) ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
+state Future<Void> pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
 try {
     loop {
         choose {
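Review note: pollHealth keeps its Never()-or-delay shape; only the condition changed, presumably because the search loop above now only exits with a healthy team, making the !anyHealthy test dead. A sketch of the pattern, modeling flow's Never() as an empty optional deadline (HEALTH_POLL_TIME's value is not shown in this diff; 1 s is assumed):

    #include <chrono>
    #include <cstdio>
    #include <optional>

    using Clock = std::chrono::steady_clock;

    std::optional<Clock::time_point> makePollHealth(bool signalledTransferComplete) {
        // After the change, only transfer completion disables the poll; an
        // unhealthy destination no longer does (the !anyHealthy test is gone).
        if (signalledTransferComplete)
            return std::nullopt;                        // Never(): poll disabled
        return Clock::now() + std::chrono::seconds(1);  // delay( HEALTH_POLL_TIME )
    }

    int main() {
        auto poll = makePollHealth(/*signalledTransferComplete=*/false);
        printf("health polling %s\n", poll ? "armed" : "disabled");
        return 0;
    }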
@@ -148,6 +148,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 init( FREE_SPACE_RATIO_CUTOFF, 0.1 );
 init( FREE_SPACE_RATIO_DD_CUTOFF, 0.2 );
 init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = 1;
+init( MAX_TEAMS_PER_SERVER, 3*DESIRED_TEAMS_PER_SERVER );
 init( DD_SHARD_SIZE_GRANULARITY, 5000000 );
 init( DD_SHARD_SIZE_GRANULARITY_SIM, 500000 ); if( randomize && BUGGIFY ) DD_SHARD_SIZE_GRANULARITY_SIM = 0;
 init( DD_MOVE_KEYS_PARALLELISM, 20 ); if( randomize && BUGGIFY ) DD_MOVE_KEYS_PARALLELISM = 1;
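Review note: the new knob is initialized after DESIRED_TEAMS_PER_SERVER and derived from it, so simulation's buggified value (1 instead of 5) propagates to the cap (3 instead of 15). A compilable sketch of that init ordering (the coin-flip BUGGIFY here is a stand-in, not the real macro):

    #include <cstdio>
    #include <cstdlib>

    struct Knobs {
        int DESIRED_TEAMS_PER_SERVER;
        int MAX_TEAMS_PER_SERVER;

        explicit Knobs(bool randomize) {
            DESIRED_TEAMS_PER_SERVER = 5;
            if (randomize && rand() % 2) DESIRED_TEAMS_PER_SERVER = 1;  // "BUGGIFY"
            // Derived after the possible override, so it tracks the
            // buggified value: 15 normally, 3 when desired is forced to 1.
            MAX_TEAMS_PER_SERVER = 3 * DESIRED_TEAMS_PER_SERVER;
        }
    };

    int main() {
        Knobs k(/*randomize=*/false);
        printf("desired=%d max=%d teams per server\n",
               k.DESIRED_TEAMS_PER_SERVER, k.MAX_TEAMS_PER_SERVER);
        return 0;
    }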
@@ -252,7 +253,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 init( MIN_BALANCE_DIFFERENCE, 10000 );
 init( SECONDS_BEFORE_NO_FAILURE_DELAY, 8 * 3600 );
 init( MAX_TXS_SEND_MEMORY, 1e7 ); if( randomize && BUGGIFY ) MAX_TXS_SEND_MEMORY = 1e5;
-init( MAX_RECOVERY_VERSIONS, 200 * VERSIONS_PER_SECOND ); if( randomize && BUGGIFY ) MAX_RECOVERY_VERSIONS = VERSIONS_PER_SECOND;
+init( MAX_RECOVERY_VERSIONS, 200 * VERSIONS_PER_SECOND );
 init( MAX_RECOVERY_TIME, 20.0 ); if( randomize && BUGGIFY ) MAX_RECOVERY_TIME = 1.0;
 
 // Resolver
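Review note: with the BUGGIFY override removed, MAX_RECOVERY_VERSIONS is always the full 200-second window expressed in versions. Assuming the usual VERSIONS_PER_SECOND of 1e6 (its definition is outside this diff), the window works out as:

    #include <cstdio>

    int main() {
        // Assumed value; defined elsewhere in Knobs.cpp, not in this diff.
        const long long VERSIONS_PER_SECOND   = 1000000;
        const long long MAX_RECOVERY_VERSIONS = 200 * VERSIONS_PER_SECOND;
        printf("recovery window: %lld versions (~%lld s of version advance)\n",
               MAX_RECOVERY_VERSIONS, MAX_RECOVERY_VERSIONS / VERSIONS_PER_SECOND);
        return 0;
    }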
@@ -112,6 +112,7 @@ public:
 double FREE_SPACE_RATIO_CUTOFF;
 double FREE_SPACE_RATIO_DD_CUTOFF;
 int DESIRED_TEAMS_PER_SERVER;
+int MAX_TEAMS_PER_SERVER;
 int64_t DD_SHARD_SIZE_GRANULARITY;
 int64_t DD_SHARD_SIZE_GRANULARITY_SIM;
 int DD_MOVE_KEYS_PARALLELISM;