Occasionally throw wrong_shard_server when waitMetrics times out, so that the waitMetrics request can get the correct number of shards if two shards have been merged, or a shard has been split, and the same storage server owns all the chunks.

2020-01-15 13:22:18 -08:00 · 2020-01-15 13:22:18 -08:00 · 4b90487b90
parent fd5705a451
commit 4b90487b90
3 changed files with 14 additions and 3 deletions
--- a/fdbserver/Knobs.cpp
+++ b/fdbserver/Knobs.cpp
@@ -476,6 +476,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
 	init( BEHIND_CHECK_DELAY,                                    2.0 );
 	init( BEHIND_CHECK_COUNT,                                      2 );
 	init( BEHIND_CHECK_VERSIONS,             5 * VERSIONS_PER_SECOND );
+	init( WAIT_METRICS_WRONG_SHARD_CHANCE,                       0.1 );

 	//Wait Failure
 	init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS,                 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
--- a/fdbserver/Knobs.h
+++ b/fdbserver/Knobs.h
@@ -419,6 +419,7 @@ public:
 	double BEHIND_CHECK_DELAY;
 	int BEHIND_CHECK_COUNT;
 	int64_t BEHIND_CHECK_VERSIONS;
+	double WAIT_METRICS_WRONG_SHARD_CHANCE;

 	//Wait Failure
 	int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -3359,9 +3359,18 @@ ACTOR Future<Void> waitMetrics( StorageServerMetrics* self, WaitMetricsRequest r
 				break;
 			}

-			if ( timedout || !req.min.allLessOrEqual( metrics ) || !metrics.allLessOrEqual( req.max ) ) {
-				TEST( !timedout ); // ShardWaitMetrics return case 2 (delayed)
-				TEST( timedout ); // ShardWaitMetrics return on timeout
+			if( timedout ) {
+				TEST( true ); // ShardWaitMetrics return on timeout
+				if(deterministicRandom()->random01() < SERVER_KNOBS->WAIT_METRICS_WRONG_SHARD_CHANCE) {
+					req.reply.sendError( wrong_shard_server() );
+				} else {
+					req.reply.send( metrics );
+				}
+				break;
+			}
+
+			if ( !req.min.allLessOrEqual( metrics ) || !metrics.allLessOrEqual( req.max ) ) {
+				TEST( true ); // ShardWaitMetrics return case 2 (delayed)
 				req.reply.send( metrics );
 				break;
 			}