Merge pull request #2814 from etschannen/feature-delay-recovery
Prevent coordinated state from filling up with too many old generations
commit f2defc3a3a
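In brief: every recovery appends the previous tlog generation to the coordinated state, so a cluster stuck in a recovery loop grows that state without bound. This change publishes the generation count as "active_generations" in status, delays each further recovery by RECOVERY_DELAY_SECONDS_PER_GENERATION for every generation past RECOVERY_DELAY_START_GENERATION, and halts recovery outright at MAX_GENERATIONS unless an operator raises MAX_GENERATIONS_OVERRIDE.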
@@ -402,6 +402,7 @@
             },
             "required_logs":3,
             "missing_logs":"7f8d623d0cb9966e",
+            "active_generations":1,
             "description":"Recovery complete."
         },
         "workload":{
@@ -41,6 +41,10 @@ ClientKnobs::ClientKnobs(bool randomize) {
     init( CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY );
     init( FAILURE_EMERGENCY_DELAY, 30.0 );
     init( FAILURE_MAX_GENERATIONS, 10 );
+    init( RECOVERY_DELAY_START_GENERATION, 70 );
+    init( RECOVERY_DELAY_SECONDS_PER_GENERATION, 60.0 );
+    init( MAX_GENERATIONS, 100 );
+    init( MAX_GENERATIONS_OVERRIDE, 0 );

     init( COORDINATOR_RECONNECTION_DELAY, 1.0 );
     init( CLIENT_EXAMPLE_AMOUNT, 20 );
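With the defaults above, the throttle only engages once more than 70 old generations have accumulated: at 85 old generations, recovery is delayed by 60.0 s × (85 − 70) = 900 s, and at 100 or more it stops entirely. MAX_GENERATIONS_OVERRIDE defaults to 0, so every recovery is subject to these checks until an operator raises it.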
@@ -40,6 +40,10 @@ public:
     double CLIENT_FAILURE_TIMEOUT_DELAY;
     double FAILURE_EMERGENCY_DELAY;
     double FAILURE_MAX_GENERATIONS;
+    double RECOVERY_DELAY_START_GENERATION;
+    double RECOVERY_DELAY_SECONDS_PER_GENERATION;
+    double MAX_GENERATIONS;
+    double MAX_GENERATIONS_OVERRIDE;

     double COORDINATOR_RECONNECTION_DELAY;
     int CLIENT_EXAMPLE_AMOUNT;
@@ -428,6 +428,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
             },
             "required_logs":3,
             "missing_logs":"7f8d623d0cb9966e",
+            "active_generations":1,
             "description":"Recovery complete."
         },
         "workload":{
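Note that the same schema fragment is maintained in two places — this second hunk is the JSONSchemas::statusSchema literal in the client — so the new "active_generations" field has to be added to both copies to keep them in sync.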
@@ -965,8 +965,9 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails
     state JsonBuilderObject message;

     try {
+        state Future<TraceEventFields> activeGens = timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryGenerations") ) ), 1.0);
         TraceEventFields md = wait( timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0) );
-        int mStatusCode = md.getInt("StatusCode");
+        state int mStatusCode = md.getInt("StatusCode");
         if (mStatusCode < 0 || mStatusCode >= RecoveryStatus::END)
             throw attribute_not_found();

@@ -990,6 +991,12 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails
         // TODO: time_in_recovery: 0.5
         //       time_in_state: 0.1

+        TraceEventFields md = wait(activeGens);
+        if(md.size()) {
+            int activeGenerations = md.getInt("ActiveGenerations");
+            message["active_generations"] = activeGenerations;
+        }
+
     } catch (Error &e){
         if (e.code() == error_code_actor_cancelled)
             throw;
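Two details worth noting in these hunks: the MasterRecoveryGenerations request is issued before the MasterRecoveryState reply is awaited, so the two eventLogRequest round-trips overlap rather than run back-to-back; and mStatusCode becomes a state variable because in flow any local that must survive a later wait() — here the wait(activeGens) further down — has to be declared state.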
@@ -1161,6 +1161,10 @@ ACTOR Future<Void> trackTlogRecovery( Reference<MasterData> self, Reference<Asyn
             .detail("StatusCode", RecoveryStatus::fully_recovered)
             .detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
             .trackLatest("MasterRecoveryState");
+
+        TraceEvent("MasterRecoveryGenerations", self->dbgid)
+            .detail("ActiveGenerations", 1)
+            .trackLatest("MasterRecoveryGenerations");
     } else if( !newState.oldTLogData.size() && self->recoveryState < RecoveryState::STORAGE_RECOVERED ) {
         self->recoveryState = RecoveryState::STORAGE_RECOVERED;
         TraceEvent("MasterRecoveryState", self->dbgid)
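Once the cluster reaches fully_recovered, trackTlogRecovery re-publishes MasterRecoveryGenerations with ActiveGenerations = 1, so the trackLatest'd value (and therefore status) reflects that the old generations have been drained.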
@@ -1245,10 +1249,27 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
         .detail("StatusCode", RecoveryStatus::locking_coordinated_state)
         .detail("Status", RecoveryStatus::names[RecoveryStatus::locking_coordinated_state])
         .detail("TLogs", self->cstate.prevDBState.tLogs.size())
         .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1)
         .detail("MyRecoveryCount", self->cstate.prevDBState.recoveryCount+2)
         .detail("ForceRecovery", self->forceRecovery)
         .trackLatest("MasterRecoveryState");
+
+    TraceEvent("MasterRecoveryGenerations", self->dbgid)
+        .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1)
+        .trackLatest("MasterRecoveryGenerations");
+
+    if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_OVERRIDE) {
+        if (self->cstate.myDBState.oldTLogData.size() >= CLIENT_KNOBS->MAX_GENERATIONS) {
+            TraceEvent(SevError, "RecoveryStoppedTooManyOldGenerations").detail("OldGenerations", self->cstate.myDBState.oldTLogData.size())
+                .detail("Reason", "Recovery stopped because too many recoveries have happened since the last time the cluster was fully_recovered. Set --knob_max_generations_override on your server processes to a value larger than OldGenerations to resume recovery once the underlying problem has been fixed.");
+            wait(Future<Void>(Never()));
+        } else if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION) {
+            TraceEvent(SevError, "RecoveryDelayedTooManyOldGenerations").detail("OldGenerations", self->cstate.myDBState.oldTLogData.size())
+                .detail("Reason", "Recovery is delayed because too many recoveries have happened since the last time the cluster was fully_recovered. Set --knob_max_generations_override on your server processes to a value larger than OldGenerations to resume recovery once the underlying problem has been fixed.");
+            wait(delay(CLIENT_KNOBS->RECOVERY_DELAY_SECONDS_PER_GENERATION*(self->cstate.myDBState.oldTLogData.size() - CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION)));
+        }
+    }
+
     state Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems( new AsyncVar<Reference<ILogSystem>> );
     state Future<Void> recoverAndEndEpoch = ILogSystem::recoverAndEndEpoch(oldLogSystems, self->dbgid, self->cstate.prevDBState, self->myInterface.tlogRejoin.getFuture(), self->myInterface.locality, &self->forceRecovery);

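To make the gating policy above easier to reason about in isolation, here is a minimal sketch of the same thresholds as plain C++ (outside flow). The knob values are the defaults from the Knobs.cpp hunk; computeRecoveryDelay is a hypothetical helper written for this note, not part of the FoundationDB codebase.

#include <cstdio>
#include <optional>

// Illustrative stand-ins for the new client knobs (defaults from Knobs.cpp).
constexpr int    RECOVERY_DELAY_START_GENERATION       = 70;
constexpr double RECOVERY_DELAY_SECONDS_PER_GENERATION = 60.0;
constexpr int    MAX_GENERATIONS                       = 100;
constexpr int    MAX_GENERATIONS_OVERRIDE              = 0;

// Hypothetical helper: returns the delay (in seconds) to apply before
// continuing recovery, or std::nullopt when recovery must stop entirely.
std::optional<double> computeRecoveryDelay(int oldGenerations) {
    if (oldGenerations <= MAX_GENERATIONS_OVERRIDE)
        return 0.0;            // operator override: proceed immediately
    if (oldGenerations >= MAX_GENERATIONS)
        return std::nullopt;   // too many generations: stop recovery
    if (oldGenerations > RECOVERY_DELAY_START_GENERATION)
        return RECOVERY_DELAY_SECONDS_PER_GENERATION *
               (oldGenerations - RECOVERY_DELAY_START_GENERATION);
    return 0.0;                // below the threshold: no delay
}

int main() {
    for (int g : {10, 71, 85, 100}) {
        std::optional<double> d = computeRecoveryDelay(g);
        if (d)
            std::printf("generations=%3d -> delay %.0fs\n", g, *d);
        else
            std::printf("generations=%3d -> recovery stopped\n", g);
    }
}

Running this prints, for example, "generations= 85 -> delay 900s", matching the worked example after the Knobs.cpp hunk, and "generations=100 -> recovery stopped", which corresponds to the wait(Future<Void>(Never())) in masterCore above.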