From adc0fea18c30299dcf712fc38a0e525fdb6e74b9 Mon Sep 17 00:00:00 2001
From: Josh Slocum
Date: Mon, 29 Aug 2022 17:29:28 -0500
Subject: [PATCH] Fix rare force purge and granule assignment race (#8018)

* Fix rare force purge and granule assignment race

* Adding missed transaction options
---
 fdbserver/BlobWorker.actor.cpp | 36 ++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp
index dc17efd7dd..582f17d31b 100644
--- a/fdbserver/BlobWorker.actor.cpp
+++ b/fdbserver/BlobWorker.actor.cpp
@@ -1839,6 +1839,20 @@ ACTOR Future waitVersionCommitted(Reference bwData,
 	return Void();
 }
 
+ACTOR Future checkFileNotFoundForcePurgeRace(Reference bwData, KeyRange range) { // reads the force-purge state of 'range'
+	state Transaction tr(bwData->db);
+	loop { // repeat until the read returns; a failed attempt first waits on tr.onError(e)
+		try {
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); // set inside the loop, i.e. again on every attempt
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE); // presumably re-set because onError() clears options - TODO confirm
+			ForcedPurgeState purgeState = wait(getForcePurgedState(&tr, range));
+			return purgeState != ForcedPurgeState::NonePurged; // NOTE(review): caller awaits a bool, so the return type is presumably Future<bool>; template arguments look stripped in this copy - confirm
+		} catch (Error& e) {
+			wait(tr.onError(e));
+		}
+	}
+}
+
 // updater for a single granule
 // TODO: this is getting kind of large. Should try to split out this actor if it continues to grow?
 ACTOR Future blobGranuleUpdateFiles(Reference bwData,
@@ -2637,17 +2651,31 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData,
 			throw e;
 		}
 
+		state Error e2 = e; // 'state' copy of e; only e2 is used after the wait() below (presumably e is not usable across a wait - confirm)
+		if (e.code() == error_code_file_not_found) { // check file_not_found against the force-purge state before treating it as fatal
+			// FIXME: better way to fix this?
+			bool isForcePurging = wait(checkFileNotFoundForcePurgeRace(bwData, metadata->keyRange));
+			if (isForcePurging) {
+				CODE_PROBE(true, "Granule got file not found from force purge");
+				TraceEvent("GranuleFileUpdaterFileNotFoundForcePurge", bwData->id)
+				    .error(e2)
+				    .detail("KeyRange", metadata->keyRange)
+				    .detail("GranuleID", startState.granuleID);
+				return Void(); // finish here: the SevError trace, fatalError.sendError and rethrow below are skipped
+			}
+		}
+
 		TraceEvent(SevError, "GranuleFileUpdaterUnexpectedError", bwData->id)
-		    .error(e)
+		    .error(e2)
 		    .detail("Granule", metadata->keyRange)
 		    .detail("GranuleID", startState.granuleID);
 		ASSERT_WE_THINK(false);
 
 		// if not simulation, kill the BW
 		if (bwData->fatalError.canBeSet()) {
-			bwData->fatalError.sendError(e);
+			bwData->fatalError.sendError(e2);
 		}
-		throw e;
+		throw e2;
 	}
 }
 
@@ -4919,4 +4947,4 @@ ACTOR Future blobWorker(BlobWorkerInterface bwInterf,
 	return Void();
 }
 
-// TODO add unit tests for assign/revoke range, especially version ordering
\ No newline at end of file
+// TODO add unit tests for assign/revoke range, especially version ordering