From e5d53c863be8a4ed358e436dc733680f0087ab25 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 16 Mar 2020 10:29:17 -0700 Subject: [PATCH] report in status the number of active generations --- fdbserver/Status.actor.cpp | 9 ++++++++- fdbserver/masterserver.actor.cpp | 10 +++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index d1cab6de22..614ff349db 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -961,8 +961,9 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails state JsonBuilderObject message; try { + state Future<TraceEventFields> activeGens = timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryGenerations") ) ), 1.0); TraceEventFields md = wait( timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0) ); - state int mStatusCode = md.getInt("StatusCode"); + int mStatusCode = md.getInt("StatusCode"); if (mStatusCode < 0 || mStatusCode >= RecoveryStatus::END) throw attribute_not_found(); @@ -986,6 +987,12 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails // TODO: time_in_recovery: 0.5 // time_in_state: 0.1 + TraceEventFields md = wait(activeGens); + if(md.size()) { + int activeGenerations = md.getInt("ActiveGenerations"); + message["active_generations"] = activeGenerations; + } + } catch (Error &e){ if (e.code() == error_code_actor_cancelled) throw; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 2b9cbf15f2..2a77359edc 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -1161,6 +1161,10 @@ ACTOR Future<Void> trackTlogRecovery( Reference<MasterData> self, Reference<Asyn .detail("StatusCode", RecoveryStatus::fully_recovered) .detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered]) .trackLatest("MasterRecoveryState"); + + TraceEvent("MasterRecoveryGenerations", self->dbgid) + .detail("ActiveGenerations", 0) + .trackLatest("MasterRecoveryGenerations"); } else if( !newState.oldTLogData.size() && self->recoveryState < RecoveryState::STORAGE_RECOVERED ) { self->recoveryState = 
RecoveryState::STORAGE_RECOVERED; TraceEvent("MasterRecoveryState", self->dbgid) @@ -1245,11 +1249,15 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) { .detail("StatusCode", RecoveryStatus::locking_coordinated_state) .detail("Status", RecoveryStatus::names[RecoveryStatus::locking_coordinated_state]) .detail("TLogs", self->cstate.prevDBState.tLogs.size()) - .detail("OldGenerations", self->cstate.myDBState.oldTLogData.size()) + .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size()) .detail("MyRecoveryCount", self->cstate.prevDBState.recoveryCount+2) .detail("ForceRecovery", self->forceRecovery) .trackLatest("MasterRecoveryState"); + TraceEvent("MasterRecoveryGenerations", self->dbgid) + .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size()) + .trackLatest("MasterRecoveryGenerations"); + if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_OVERRIDE) { if (self->cstate.myDBState.oldTLogData.size() >= CLIENT_KNOBS->MAX_GENERATIONS) { TraceEvent(SevError, "RecoveryStoppedTooManyOldGenerations").detail("OldGenerations", self->cstate.myDBState.oldTLogData.size())