FastRestore: Fix various bugs discovered by enhanced simulation

1. A sendMutation request can be dispatched after its version batch has finished and its data has been deleted;
2. The request scheduler on the loader may get into an infinite loop because the FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE knob can be set to 0 in simulation.
This commit is contained in:
Meng Xu 2020-08-26 15:21:33 -07:00
parent f1bd2a18ed
commit 7a29a3157f
2 changed files with 42 additions and 19 deletions

View File

@ -646,9 +646,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( FASTRESTORE_SCHED_UPDATE_DELAY, 0.1 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_UPDATE_DELAY = deterministicRandom()->random01() * 2;} init( FASTRESTORE_SCHED_UPDATE_DELAY, 0.1 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_UPDATE_DELAY = deterministicRandom()->random01() * 2;}
init( FASTRESTORE_SCHED_TARGET_CPU_PERCENT, 70 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_TARGET_CPU_PERCENT = deterministicRandom()->random01() * 100 + 50;} // simulate cpu usage can be larger than 100 init( FASTRESTORE_SCHED_TARGET_CPU_PERCENT, 70 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_TARGET_CPU_PERCENT = deterministicRandom()->random01() * 100 + 50;} // simulate cpu usage can be larger than 100
init( FASTRESTORE_SCHED_MAX_CPU_PERCENT, 90 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_MAX_CPU_PERCENT = FASTRESTORE_SCHED_TARGET_CPU_PERCENT + deterministicRandom()->random01() * 100;} init( FASTRESTORE_SCHED_MAX_CPU_PERCENT, 90 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_MAX_CPU_PERCENT = FASTRESTORE_SCHED_TARGET_CPU_PERCENT + deterministicRandom()->random01() * 100;}
init( FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS, 50 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 30;} init( FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS, 50 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 30 + 1;}
init( FASTRESTORE_SCHED_INFLIGHT_SEND_REQS, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_SEND_REQS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 10;} init( FASTRESTORE_SCHED_INFLIGHT_SEND_REQS, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_SEND_REQS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 10 + 1;}
init( FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 10;} init( FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 10 + 1;}
init( FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 15 + 1;} init( FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 15 + 1;}
init( FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH, 2 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 15 + 1;} init( FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH, 2 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 15 + 1;}
init( FASTRESTORE_NUM_TRACE_EVENTS, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_NUM_TRACE_EVENTS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 500 + 1;} init( FASTRESTORE_NUM_TRACE_EVENTS, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_NUM_TRACE_EVENTS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 500 + 1;}

View File

@ -70,6 +70,7 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
try { try {
state int curVBInflightReqs = 0; state int curVBInflightReqs = 0;
state int sendLoadParams = 0; state int sendLoadParams = 0;
state int lastLoadReqs = 0;
loop { loop {
TraceEvent(SevDebug, "FastRestoreLoaderDispatchRequests", self->id()) TraceEvent(SevDebug, "FastRestoreLoaderDispatchRequests", self->id())
.detail("SendingQueue", self->sendingQueue.size()) .detail("SendingQueue", self->sendingQueue.size())
@ -79,6 +80,8 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
.detail("InflightSendingReqsThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS) .detail("InflightSendingReqsThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS)
.detail("InflightLoadingReqs", self->inflightLoadingReqs) .detail("InflightLoadingReqs", self->inflightLoadingReqs)
.detail("InflightLoadingReqsThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS) .detail("InflightLoadingReqsThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS)
.detail("LastLoadFileRequests", lastLoadReqs)
.detail("LoadFileRequestsBatchThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE)
.detail("LastDispatchSendLoadParamReqsForCurrentVB", curVBInflightReqs) .detail("LastDispatchSendLoadParamReqsForCurrentVB", curVBInflightReqs)
.detail("LastDispatchSendLoadParamReqsForFutureVB", sendLoadParams) .detail("LastDispatchSendLoadParamReqsForFutureVB", sendLoadParams)
.detail("CpuUsage", self->cpuUsage) .detail("CpuUsage", self->cpuUsage)
@ -168,15 +171,24 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
} }
// Dispatch loading backup file requests // Dispatch loading backup file requests
int loadReqs = 0; lastLoadReqs = 0;
while (!self->loadingQueue.empty()) { while (!self->loadingQueue.empty()) {
if (loadReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE) { if (lastLoadReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE) {
break; break;
} }
loadReqs++;
const RestoreLoadFileRequest& req = self->loadingQueue.top(); const RestoreLoadFileRequest& req = self->loadingQueue.top();
self->addActor.send(handleLoadFileRequest(req, self)); if (req.batchIndex <= self->finishedBatch.get()) {
self->loadingQueue.pop(); TraceEvent(SevError, "FastRestoreLoaderDispatchRestoreLoadFileRequestTooOld")
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("RequestBatchIndex", req.batchIndex);
req.reply.send(RestoreLoadFileReply(req.param, true));
self->loadingQueue.pop();
ASSERT(false); // Check if this ever happens easily
} else {
self->addActor.send(handleLoadFileRequest(req, self));
self->loadingQueue.pop();
lastLoadReqs++;
}
} }
if (self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) { if (self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) {
@ -595,11 +607,22 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
// Do not need to block on low memory usage because this actor should not increase memory usage. // Do not need to block on low memory usage because this actor should not increase memory usage.
ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req, ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req,
Reference<RestoreLoaderData> self) { Reference<RestoreLoaderData> self) {
state Reference<LoaderBatchData> batchData = self->batch[req.batchIndex]; state Reference<LoaderBatchData> batchData;
state Reference<LoaderBatchStatus> batchStatus = self->status[req.batchIndex]; state Reference<LoaderBatchStatus> batchStatus;
state bool isDuplicated = true; state bool isDuplicated = true;
if (req.batchIndex <= self->finishedBatch.get()) {
TraceEvent(SevWarn, "FastRestoreLoaderRestoreSendMutationsToAppliersRequestTooOld")
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("RequestBatchIndex", req.batchIndex);
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
return Void();
}
batchData = self->batch[req.batchIndex];
batchStatus = self->status[req.batchIndex];
ASSERT(batchData.isValid() && batchStatus.isValid()); ASSERT(batchData.isValid() && batchStatus.isValid());
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(req.batchIndex > self->finishedBatch.get()); ASSERT(req.batchIndex > self->finishedBatch.get());
TraceEvent("FastRestoreLoaderPhaseSendMutations", self->id()) TraceEvent("FastRestoreLoaderPhaseSendMutations", self->id())
.detail("BatchIndex", req.batchIndex) .detail("BatchIndex", req.batchIndex)
@ -607,8 +630,6 @@ ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequ
.detail("LoaderSendStatus", batchStatus->toString()); .detail("LoaderSendStatus", batchStatus->toString());
// The VB must finish loading phase before it can send mutations; update finishedLoadingVB for scheduler // The VB must finish loading phase before it can send mutations; update finishedLoadingVB for scheduler
self->finishedLoadingVB = std::max(self->finishedLoadingVB, req.batchIndex); self->finishedLoadingVB = std::max(self->finishedLoadingVB, req.batchIndex);
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(self->finishedBatch.get() < req.batchIndex);
// Ensure each file is sent exactly once by using batchStatus->sendAllLogs and batchStatus->sendAllRanges // Ensure each file is sent exactly once by using batchStatus->sendAllLogs and batchStatus->sendAllRanges
if (!req.useRangeFile) { if (!req.useRangeFile) {
@ -1220,19 +1241,21 @@ ACTOR Future<Void> handleFinishVersionBatchRequest(RestoreVersionBatchRequest re
if (self->finishedBatch.get() == req.batchIndex - 1) { if (self->finishedBatch.get() == req.batchIndex - 1) {
// Sanity check: All requests before and in this batchIndex must have been processed; otherwise, // Sanity check: All requests before and in this batchIndex must have been processed; otherwise,
// those requests may cause segmentation fault after applier remove the batch data // those requests may cause segmentation fault after applier remove the batch data
// TODO: Pop old requests while (!self->loadingQueue.empty() && self->loadingQueue.top().batchIndex <= req.batchIndex) {
if (!self->loadingQueue.empty() && self->loadingQueue.top().batchIndex <= req.batchIndex) {
// Still has pending requests from earlier batchIndex and current batchIndex, which should not happen // Still has pending requests from earlier batchIndex and current batchIndex, which should not happen
TraceEvent(SevError, "FastRestoreLoaderHasPendingLoadFileRequests") TraceEvent(SevWarn, "FastRestoreLoaderHasPendingLoadFileRequests")
.detail("PendingRequest", self->loadingQueue.top().toString()); .detail("PendingRequest", self->loadingQueue.top().toString());
self->loadingQueue.pop();
} }
if (!self->sendingQueue.empty() && self->sendingQueue.top().batchIndex <= req.batchIndex) { while (!self->sendingQueue.empty() && self->sendingQueue.top().batchIndex <= req.batchIndex) {
TraceEvent(SevError, "FastRestoreLoaderHasPendingSendRequests") TraceEvent(SevWarn, "FastRestoreLoaderHasPendingSendRequests")
.detail("PendingRequest", self->sendingQueue.top().toString()); .detail("PendingRequest", self->sendingQueue.top().toString());
self->sendingQueue.pop();
} }
if (!self->sendLoadParamQueue.empty() && self->sendLoadParamQueue.top().batchIndex <= req.batchIndex) { while (!self->sendLoadParamQueue.empty() && self->sendLoadParamQueue.top().batchIndex <= req.batchIndex) {
TraceEvent(SevError, "FastRestoreLoaderHasPendingSendLoadParamRequests") TraceEvent(SevWarn, "FastRestoreLoaderHasPendingSendLoadParamRequests")
.detail("PendingRequest", self->sendLoadParamQueue.top().toString()); .detail("PendingRequest", self->sendLoadParamQueue.top().toString());
self->sendLoadParamQueue.pop();
} }
self->finishedBatch.set(req.batchIndex); self->finishedBatch.set(req.batchIndex);