Merge pull request #6047 from sfc-gh-etschannen/fix-best-team
Data distribution could miss storage server failures
This commit is contained in:
commit
6798d2972c
|
@ -63,7 +63,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( MAX_MESSAGE_SIZE, std::max<int>(LOG_SYSTEM_PUSHED_DATA_BLOCK_SIZE, 1e5 + 2e4 + 1) + 8 ); // VALUE_SIZE_LIMIT + SYSTEM_KEY_SIZE_LIMIT + 9 bytes (4 bytes for length, 4 bytes for sequence number, and 1 byte for mutation type)
|
||||
init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 );
|
||||
init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473
|
||||
init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 20 : 120;
|
||||
init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = 120; // Cannot be buggified lower without changing the following assert in LogSystemPeekCursor.actor.cpp: ASSERT_WE_THINK(e.code() == error_code_operation_obsolete || SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME < 10);
|
||||
init( PEEK_USING_STREAMING, true ); if( randomize && BUGGIFY ) PEEK_USING_STREAMING = false;
|
||||
init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
|
||||
init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 );
|
||||
|
|
|
@ -1027,8 +1027,18 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
|
|||
// bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
|
||||
// server that hosts the relocateData. This is possible, for example, in a fearless configuration
|
||||
// when the remote DC is just brought up.
|
||||
std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam =
|
||||
wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
|
||||
Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
|
||||
brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
|
||||
state bool bestTeamReady = fbestTeam.isReady();
|
||||
std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
|
||||
if (tciIndex > 0 && !bestTeamReady) {
|
||||
// self->shardsAffectedByTeamFailure->moveShard must be called without any waits after getting
|
||||
// the destination team or we could miss failure notifications for the storage servers in the
|
||||
// destination team
|
||||
TraceEvent("BestTeamNotReady");
|
||||
foundTeams = false;
|
||||
break;
|
||||
}
|
||||
// If a DC has no healthy team, we stop checking the other DCs until
|
||||
// the unhealthy DC is healthy again or is excluded.
|
||||
if (!bestTeam.first.present()) {
|
||||
|
|
Loading…
Reference in New Issue