diff --git a/documentation/sphinx/source/release-notes/release-notes-620.rst b/documentation/sphinx/source/release-notes/release-notes-620.rst index 4b0aea9a3c..2e963bc7ef 100644 --- a/documentation/sphinx/source/release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/release-notes/release-notes-620.rst @@ -2,6 +2,10 @@ Release Notes ############# +6.2.31 +====== +* Fix a rare invalid memory access on data distributor when snapshotting large clusters. This is a follow up to `PR #4076 `_. `(PR #4317) `_ + 6.2.30 ====== * A storage server which has fallen behind will deprioritize reads in order to catch up. This change causes some saturating workloads to experience high read latencies instead of high GRV latencies. `(PR #4218) `_ diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 22d95bcd9a..b4938656f4 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -237,7 +237,7 @@ ACTOR Future dataDistributionTracker(Reference in FutureStream> getAverageShardBytes, Promise readyToStart, Reference> anyZeroHealthyTeams, UID distributorId, KeyRangeMap* shards, - bool const* trackerCancelled); + bool* trackerCancelled); ACTOR Future dataDistributionQueue( Database cx, PromiseStream output, FutureStream input, diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 03fa937b64..e1d10cecc7 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -94,7 +94,7 @@ struct DataDistributionTracker { // The reference to trackerCancelled must be extracted by actors, // because by the time (trackerCancelled == true) this memory cannot // be accessed - bool const& trackerCancelled; + bool& trackerCancelled; // This class extracts the trackerCancelled reference from a DataDistributionTracker object // Because some actors spawned by the dataDistributionTracker outlive the DataDistributionTracker @@ -123,7 +123,7 @@ struct DataDistributionTracker { PromiseStream const& output, Reference shardsAffectedByTeamFailure, Reference> anyZeroHealthyTeams, KeyRangeMap& shards, - bool const& trackerCancelled) + bool& trackerCancelled) : cx(cx), distributorId(distributorId), dbSizeEstimate(new AsyncVar()), systemSizeEstimate(0), maxShardSize(new AsyncVar>()), sizeChanges(false), readyToStart(readyToStart), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), anyZeroHealthyTeams(anyZeroHealthyTeams), @@ -131,6 +131,7 @@ struct DataDistributionTracker { ~DataDistributionTracker() { + trackerCancelled = true; //Cancel all actors so they aren't waiting on sizeChanged broken promise sizeChanges.clear(false); } @@ -901,7 +902,7 @@ ACTOR Future dataDistributionTracker(Reference in FutureStream> getAverageShardBytes, Promise readyToStart, Reference> anyZeroHealthyTeams, UID distributorId, KeyRangeMap* shards, - bool const* trackerCancelled) { + bool* trackerCancelled) { state DataDistributionTracker self(cx, distributorId, readyToStart, output, shardsAffectedByTeamFailure, anyZeroHealthyTeams, *shards, *trackerCancelled); state Future loggingTrigger = Void(); diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index f7b1157af7..7f75637fb5 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1562,6 +1562,9 @@ int main(int argc, char* argv[]) { delete FLOW_KNOBS; delete SERVER_KNOBS; delete CLIENT_KNOBS; + FLOW_KNOBS = nullptr; + SERVER_KNOBS = nullptr; + CLIENT_KNOBS = nullptr; FlowKnobs* flowKnobs = new FlowKnobs; ClientKnobs* clientKnobs = new ClientKnobs; ServerKnobs* serverKnobs = new ServerKnobs; diff --git a/flow/Error.cpp b/flow/Error.cpp index a264db0a75..0eaa7b08fc 100644 --- a/flow/Error.cpp +++ b/flow/Error.cpp @@ -91,13 +91,25 @@ Error internal_error_impl(const char* a_nm, long long a, const char * op_nm, con return Error(error_code_internal_error); } -Error::Error(int error_code) - : error_code(error_code), flags(0) -{ +Error::Error(int error_code) : error_code(error_code), flags(0) { if (TRACE_SAMPLE()) TraceEvent(SevSample, "ErrorCreated").detail("ErrorCode", error_code); - //std::cout << "Error: " << error_code << std::endl; + // std::cout << "Error: " << error_code << std::endl; if (error_code >= 3000 && error_code < 6000) { - TraceEvent(SevError, "SystemError").error(*this).backtrace(); + { + TraceEvent te(SevError, "SystemError"); + te.error(*this).backtrace(); + if (error_code == error_code_unknown_error) { + auto exception = std::current_exception(); + if (exception) { + try { + std::rethrow_exception(exception); + } catch (std::exception& e) { + te.detail("StdException", e.what()); + } catch (...) { + } + } + } + } if (g_crashOnError) { flushOutputStreams(); flushTraceFileVoid(); diff --git a/flow/Trace.cpp b/flow/Trace.cpp index f2542196c4..2ee97b3996 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -919,7 +919,7 @@ bool TraceEvent::init() { detail("Severity", int(severity)); detail("Time", "0.000000"); timeIndex = fields.size() - 1; - if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) { + if (FLOW_KNOBS && FLOW_KNOBS->TRACE_DATETIME_ENABLED) { detail("DateTime", ""); } @@ -1142,7 +1142,7 @@ void TraceEvent::log() { if (enabled) { double time = TraceEvent::getCurrentTime(); fields.mutate(timeIndex).second = format("%.6f", time); - if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) { + if (FLOW_KNOBS && FLOW_KNOBS->TRACE_DATETIME_ENABLED) { fields.mutate(timeIndex + 1).second = TraceEvent::printRealTime(time); } @@ -1317,7 +1317,7 @@ void TraceBatch::dump() { TraceBatch::EventInfo::EventInfo(double time, const char *name, uint64_t id, const char *location) { fields.addField("Severity", format("%d", (int)SevInfo)); fields.addField("Time", format("%.6f", time)); - if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) { + if (FLOW_KNOBS && FLOW_KNOBS->TRACE_DATETIME_ENABLED) { fields.addField("DateTime", TraceEvent::printRealTime(time)); } fields.addField("Type", name); @@ -1328,7 +1328,7 @@ TraceBatch::EventInfo::EventInfo(double time, const char *name, uint64_t id, con TraceBatch::AttachInfo::AttachInfo(double time, const char *name, uint64_t id, uint64_t to) { fields.addField("Severity", format("%d", (int)SevInfo)); fields.addField("Time", format("%.6f", time)); - if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) { + if (FLOW_KNOBS && FLOW_KNOBS->TRACE_DATETIME_ENABLED) { fields.addField("DateTime", TraceEvent::printRealTime(time)); } fields.addField("Type", name); @@ -1339,7 +1339,7 @@ TraceBatch::AttachInfo::AttachInfo(double time, const char *name, uint64_t id, u TraceBatch::BuggifyInfo::BuggifyInfo(double time, int activated, int line, std::string file) { fields.addField("Severity", format("%d", (int)SevInfo)); fields.addField("Time", format("%.6f", time)); - if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) { + if (FLOW_KNOBS && FLOW_KNOBS->TRACE_DATETIME_ENABLED) { fields.addField("DateTime", TraceEvent::printRealTime(time)); } fields.addField("Type", "BuggifySection");