// Patch summary (free text before the first "diff --git" header; not part of any hunk):
//  - Knobs.cpp / Knobs.h: add the server knob FASTRESTORE_TXN_RETRY_MAX, initialized to 10 and,
//    when `randomize && BUGGIFY`, set to `random01() * 100 + 1`.
//  - RestoreApplier.actor.cpp, applyClearRangeMutations: rename the retry counter `count` to
//    `retries` and compare it against the new knob instead of the hard-coded 100.
//  - RestoreApplier.actor.cpp, getAndComputeStagingKeys: replace the per-branch `i++` (and the
//    `continue`) with a single `i++` after the if/else.
//  - fdbserver.actor.cpp, main: the message printed before g_network->run() now says "started"
//    instead of "exits".
diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index cd2ffd6105..d052ae8695 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -595,6 +595,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( FASTRESTORE_REQBATCH_PARALLEL, 50 ); if( randomize && BUGGIFY ) { FASTRESTORE_REQBATCH_PARALLEL = deterministicRandom()->random01() * 100 + 1; } init( FASTRESTORE_REQBATCH_LOG, false ); if( randomize && BUGGIFY ) { FASTRESTORE_REQBATCH_LOG = deterministicRandom()->random01() < 0.2 ? true : false; } init( FASTRESTORE_TXN_CLEAR_MAX, 1000 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_CLEAR_MAX = deterministicRandom()->random01() * 100 + 1; } + init( FASTRESTORE_TXN_RETRY_MAX, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_RETRY_MAX = deterministicRandom()->random01() * 100 + 1; } // clang-format on diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 7f21a76eee..147446153a 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -533,6 +533,7 @@ public: int64_t FASTRESTORE_REQBATCH_PARALLEL; // number of requests to wait on for getBatchReplies() bool FASTRESTORE_REQBATCH_LOG; // verbose log information for getReplyBatches int FASTRESTORE_TXN_CLEAR_MAX; // threshold to start tracking each clear op in a txn + int FASTRESTORE_TXN_RETRY_MAX; // number of txn retries above which a warning trace event is emitted ServerKnobs(); void initialize(bool randomize = false, ClientKnobs* clientKnobs = NULL, bool isSimulated = false); diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 281da2a636..ccedec991f 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -170,7 +170,7 @@ ACTOR static Future handleSendMutationVectorRequest(RestoreSendVersionedMu ACTOR static Future applyClearRangeMutations(Standalone> ranges, double delayTime, Database cx, UID applierID, int batchIndex) { state Reference tr(new ReadYourWritesTransaction(cx)); - state int count = 0; + state int 
retries = 0; state double numOps = 0; wait(delay(delayTime + deterministicRandom()->random01() * delayTime)); TraceEvent("FastRestoreApplierClearRangeMutationsStart", applierID) @@ -198,8 +198,8 @@ ACTOR static Future applyClearRangeMutations(Standalonecommit()); break; } catch (Error& e) { - count++; - if (count > 100) { + retries++; + if (retries > SERVER_KNOBS->FASTRESTORE_TXN_RETRY_MAX) { TraceEvent(SevWarnAlways, "RestoreApplierApplyClearRangeMutationsStuck", applierID) .detail("BatchIndex", batchIndex) .detail("ClearRanges", ranges.size()) @@ -263,16 +263,14 @@ ACTOR static Future getAndComputeStagingKeys( .detail("PendingMutation", vm.second.toString()); } key.second->second.precomputeResult("GetAndComputeStagingKeysNoBaseValueInDB", applierID, batchIndex); - i++; - continue; } else { // The key's version ideally should be the most recently committed version. // But as long as it is > 1 and less than the start version of the version batch, it is the same result. MutationRef m(MutationRef::SetValue, key.first, fValues[i].get().get()); key.second->second.add(m, LogMessageVersion(1)); key.second->second.precomputeResult("GetAndComputeStagingKeys", applierID, batchIndex); - i++; } + i++; } TraceEvent("FastRestoreApplierGetAndComputeStagingKeysDone", applierID) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index fd0c6844be..40255c3c60 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1868,7 +1868,7 @@ int main(int argc, char* argv[]) { vector> actors(listenErrors.begin(), listenErrors.end()); actors.push_back(restoreWorker(opts.connectionFile, opts.localities, dataFolder)); f = stopAfter(waitForAll(actors)); - printf("Fast restore worker exits\n"); + printf("Fast restore worker started\n"); g_network->run(); printf("g_network->run() done\n"); } else { // Call fdbd roles in conventional way