diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst
index 7bcdd50540..cc2e3b3720 100644
--- a/documentation/sphinx/source/release-notes.rst
+++ b/documentation/sphinx/source/release-notes.rst
@@ -2,6 +2,14 @@
 Release Notes
 #############
 
+6.1.10
+======
+
+Performance
+-----------
+
+* Improved the recovery speed of storage servers with a large amount of data. `(PR #1700) <https://github.com/apple/foundationdb/pull/1700>`_
+
 6.1.9
 =====
 
diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp
index 46853d4627..b389115dda 100644
--- a/fdbserver/Knobs.cpp
+++ b/fdbserver/Knobs.cpp
@@ -414,8 +414,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 	init( MAX_STORAGE_SERVER_WATCH_BYTES, 100e6 ); if( randomize && BUGGIFY ) MAX_STORAGE_SERVER_WATCH_BYTES = 10e3;
 	init( MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE, 1e9 ); if( randomize && BUGGIFY ) MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE = 1e3;
 	init( LONG_BYTE_SAMPLE_RECOVERY_DELAY, 60.0 );
-	init( BYTE_SAMPLE_LOAD_PARALLELISM, 32 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_PARALLELISM = 1;
+	init( BYTE_SAMPLE_LOAD_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_PARALLELISM = 1;
 	init( BYTE_SAMPLE_LOAD_DELAY, 0.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_DELAY = 0.1;
+	init( BYTE_SAMPLE_START_DELAY, 1.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_START_DELAY = 0.0;
 	init( UPDATE_STORAGE_PROCESS_STATS_INTERVAL, 5.0 );
 
 	//Wait Failure
diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h
index 99728640f3..fedaa24e17 100644
--- a/fdbserver/Knobs.h
+++ b/fdbserver/Knobs.h
@@ -355,6 +355,7 @@ public:
 	double LONG_BYTE_SAMPLE_RECOVERY_DELAY;
 	int BYTE_SAMPLE_LOAD_PARALLELISM;
 	double BYTE_SAMPLE_LOAD_DELAY;
+	double BYTE_SAMPLE_START_DELAY;
 	double UPDATE_STORAGE_PROCESS_STATS_INTERVAL;
 
 	//Wait Failure
diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp
index 9172511928..a1edf128a0 100644
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -2958,11 +2958,12 @@ ACTOR Future<Void> applyByteSampleResult( StorageServer* data, IKeyValueStore* s
 	return Void();
 }
 
-ACTOR Future<Void> restoreByteSample(StorageServer* data, IKeyValueStore* storage, Promise<Void> byteSampleSampleRecovered) {
+ACTOR Future<Void> restoreByteSample(StorageServer* data, IKeyValueStore* storage, Promise<Void> byteSampleSampleRecovered, Future<Void> startRestore) {
 	state std::vector<Standalone<VectorRef<KeyValueRef>>> byteSampleSample;
 	wait( applyByteSampleResult(data, storage, persistByteSampleSampleKeys.begin, persistByteSampleSampleKeys.end, &byteSampleSample) );
 	byteSampleSampleRecovered.send(Void());
-	wait( delay( BUGGIFY ? g_random->random01() * 2.0 : 0.0001 ) );
+	wait( startRestore );
+	wait( delay(SERVER_KNOBS->BYTE_SAMPLE_START_DELAY) );
 
 	size_t bytes_per_fetch = 0;
 	// Since the expected size also includes (as of now) the space overhead of the container, we calculate our own number here
@@ -3009,7 +3010,8 @@ ACTOR Future<bool> restoreDurableState( StorageServer* data, IKeyValueStore* sto
 	state Future<Standalone<RangeResultRef>> fShardAvailable = storage->readRange(persistShardAvailableKeys);
 
 	state Promise<Void> byteSampleSampleRecovered;
-	data->byteSampleRecovery = restoreByteSample(data, storage, byteSampleSampleRecovered);
+	state Promise<Void> startByteSampleRestore;
+	data->byteSampleRecovery = restoreByteSample(data, storage, byteSampleSampleRecovered, startByteSampleRestore.getFuture());
 
 	TraceEvent("ReadingDurableState", data->thisServerID);
 	wait( waitForAll( (vector<Future<Optional<Value>>>(), fFormat, fID, fVersion, fLogProtocol, fPrimaryLocality) ) );
@@ -3088,6 +3090,7 @@ ACTOR Future<bool> restoreDurableState( StorageServer* data, IKeyValueStore* sto
 	}
 
 	validate(data, true);
+	startByteSampleRestore.send(Void());
 
 	return true;
 }
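
A minimal sketch of the recovery-gating pattern this patch introduces, written in plain C++17 with std::promise/std::future instead of Flow actors, so it is an illustration rather than the actual FoundationDB code: restoreByteSample still recovers the small "sample of the byte sample" right away, but the expensive bulk load now blocks on a start signal that restoreDurableState sends only once the rest of the durable state has been read, followed by the new BYTE_SAMPLE_START_DELAY pause. The timings, logging, and main() scaffolding below are illustrative assumptions.

// Sketch only: models the new startRestore gate with std::promise/std::future.
#include <chrono>
#include <future>
#include <iostream>
#include <thread>

// Stand-in for the new knob added in Knobs.cpp (seconds).
constexpr double BYTE_SAMPLE_START_DELAY = 1.0;

// Analogue of restoreByteSample(): phase 1 runs immediately, phase 2 waits
// for the go-ahead from durable-state recovery plus the knob-controlled delay.
void restoreByteSample(std::future<void> startRestore) {
    std::cout << "byte sample sample recovered\n";   // small sample, loaded eagerly

    startRestore.wait();                             // gate: wait( startRestore )
    std::this_thread::sleep_for(
        std::chrono::duration<double>(BYTE_SAMPLE_START_DELAY));
    std::cout << "bulk byte sample load started\n";  // expensive part, deferred
}

int main() {
    std::promise<void> startByteSampleRestore;       // analogue of the new Promise<Void>
    std::thread recovery(restoreByteSample, startByteSampleRestore.get_future());

    // ... the rest of restoreDurableState(): format, ID, version, shard maps ...
    std::this_thread::sleep_for(std::chrono::milliseconds(50));

    // Only now release the heavy load, as the patched restoreDurableState()
    // does with startByteSampleRestore.send(Void()).
    startByteSampleRestore.set_value();
    recovery.join();
    return 0;
}

The point of the gate, together with lowering BYTE_SAMPLE_LOAD_PARALLELISM from 32 to 8, appears to be to keep the bulk byte-sample read from competing with the reads that durable-state recovery actually blocks on, which is the recovery-speed improvement the release note describes.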