From 1afef44f195f69c3dc621cd4c3879ba27e6f0d9d Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Tue, 23 Nov 2021 15:26:38 -0800
Subject: [PATCH 1/3] fix: self->shardsAffectedByTeamFailure->moveShard must be called without any waits after getting the destination team or we could miss failure notifications for the storage servers in the destination team

---
 fdbserver/DataDistributionQueue.actor.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp
index d3ed343da5..fcbc73dbed 100644
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -1027,8 +1027,18 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
     // bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
     // server that hosts the relocateData. This is possible, for example, in a fearless configuration
     // when the remote DC is just brought up.
-    std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam =
-        wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
+    Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
+        brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
+    state bool bestTeamReady = fbestTeam.isReady();
+    std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
+    if (tciIndex > 0 && !bestTeamReady) {
+        // self->shardsAffectedByTeamFailure->moveShard must be called without any waits after getting
+        // the destination team or we could miss failure notifications for the storage servers in the
+        // destination team
+        TraceEvent("BestTeamNotReady");
+        foundTeams = false;
+        break;
+    }
     // If a DC has no healthy team, we stop checking the other DCs until
     // the unhealthy DC is healthy again or is excluded.
     if (!bestTeam.first.present()) {

From c9ee83e1b13e4e312f5d27257f73c9ec63da1555 Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Wed, 24 Nov 2021 11:28:57 -0800
Subject: [PATCH 2/3] fix: do not buggify PEEK_TRACKER_EXPIRATION_TIME to a value of 20

---
 fdbclient/ServerKnobs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp
index 80c3379adf..09ef6f191a 100644
--- a/fdbclient/ServerKnobs.cpp
+++ b/fdbclient/ServerKnobs.cpp
@@ -63,7 +63,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
     init( MAX_MESSAGE_SIZE, std::max<int>(LOG_SYSTEM_PUSHED_DATA_BLOCK_SIZE, 1e5 + 2e4 + 1) + 8 ); // VALUE_SIZE_LIMIT + SYSTEM_KEY_SIZE_LIMIT + 9 bytes (4 bytes for length, 4 bytes for sequence number, and 1 byte for mutation type)
     init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 );
     init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473
-    init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 0.1 : 120;
+    init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = 120;
     init( PEEK_USING_STREAMING, true ); if( randomize && BUGGIFY ) PEEK_USING_STREAMING = false;
     init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
     init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 );

From 8fa7085c781d5e9a6eaf4ceea4c3f2a2cc067073 Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Wed, 24 Nov 2021 11:40:41 -0800
Subject: [PATCH 3/3] added a comment

---
 fdbclient/ServerKnobs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp
index 09ef6f191a..3ed0b776c3 100644
--- a/fdbclient/ServerKnobs.cpp
+++ b/fdbclient/ServerKnobs.cpp
@@ -63,7 +63,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
     init( MAX_MESSAGE_SIZE, std::max<int>(LOG_SYSTEM_PUSHED_DATA_BLOCK_SIZE, 1e5 + 2e4 + 1) + 8 ); // VALUE_SIZE_LIMIT + SYSTEM_KEY_SIZE_LIMIT + 9 bytes (4 bytes for length, 4 bytes for sequence number, and 1 byte for mutation type)
     init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 );
     init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473
-    init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = 120;
+    init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = 120; // Cannot be buggified lower without changing the following assert in LogSystemPeekCursor.actor.cpp: ASSERT_WE_THINK(e.code() == error_code_operation_obsolete || SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME < 10);
     init( PEEK_USING_STREAMING, true ); if( randomize && BUGGIFY ) PEEK_USING_STREAMING = false;
     init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
     init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 );
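
The core of PATCH 1/3 is the ordering requirement spelled out in the new comment: once a destination team has been obtained, self->shardsAffectedByTeamFailure->moveShard must run with no intervening waits, so the patch records whether the getTeam future was already ready before waiting on it and abandons the attempt (foundTeams = false) when an actual wait occurred for tciIndex > 0. The sketch below illustrates that "record readiness before waiting" pattern as standalone C++ using std::async/std::future as a stand-in for FoundationDB's Flow futures; the names and the printed message are illustrative only, not the actual Flow API or FDB code, and the tciIndex > 0 restriction is not modeled.

    // Standalone analogy (std::future, not FoundationDB Flow): record whether the
    // future was already ready before blocking on it, then branch on that fact.
    #include <chrono>
    #include <future>
    #include <iostream>
    #include <thread>

    int main() {
        // Hypothetical stand-in for the getTeam reply; it may or may not complete immediately.
        std::future<int> fBestTeam = std::async(std::launch::async, [] {
            std::this_thread::sleep_for(std::chrono::milliseconds(50));
            return 42; // pretend this is the chosen destination team
        });

        // Capture readiness *before* blocking, mirroring `state bool bestTeamReady = fbestTeam.isReady();`.
        bool bestTeamReady =
            fBestTeam.wait_for(std::chrono::seconds(0)) == std::future_status::ready;

        int bestTeam = fBestTeam.get(); // corresponds to `wait(fbestTeam)` in the patch

        if (!bestTeamReady) {
            // A real wait happened between choosing teams and acting on them, so failure
            // notifications could have been missed; the patch restarts team selection here.
            std::cout << "BestTeamNotReady: restart destination-team selection\n";
        } else {
            std::cout << "destination team " << bestTeam << " obtained without waiting\n";
        }
        return 0;
    }

In the real actor, the not-ready path simply breaks out of the team-selection loop so selection restarts from a consistent state, rather than trusting a team whose failure notifications may have been missed during the yield.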