This resolves issue #3739 by exposing the time since the last full recovery in the `recovery_state` section of the status JSON.

This commit is contained in:
Xin Dong 2020-09-08 14:26:01 -07:00
parent cc0db5452a
commit 4363dd0f25
3 changed files with 22 additions and 3 deletions

View File

@ -482,6 +482,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
)statusSchema"
R"statusSchema(
"recovery_state":{
"time_since_last_fully_recovered_seconds":1,
"required_resolvers":1,
"required_proxies":1,
"required_grv_proxies":1,

View File

@ -1028,8 +1028,13 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails
state JsonBuilderObject message;
try {
state Future<TraceEventFields> activeGens = timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryGenerations") ) ), 1.0);
TraceEventFields md = wait( timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0) );
std::vector<Future<TraceEventFields>> futures;
futures.push_back(timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryGenerations") ) ), 1.0));
futures.push_back(timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryFullyRecovered") ) ), 1.0));
futures.push_back(timeoutError(mWorker.interf.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0));
std::vector<TraceEventFields> msgs = wait(getAll(futures));
const TraceEventFields& md = msgs[2];
int mStatusCode = md.getInt("StatusCode");
if (mStatusCode < 0 || mStatusCode >= RecoveryStatus::END)
throw attribute_not_found();
@ -1037,6 +1042,17 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails
message = JsonString::makeMessage(RecoveryStatus::names[mStatusCode], RecoveryStatus::descriptions[mStatusCode]);
*statusCode = mStatusCode;
const TraceEventFields& mLastRecoveryMsg = msgs[1];
std::string lastFullyRecoveredTimeS;
if (mLastRecoveryMsg.tryGetValue("Time", lastFullyRecoveredTimeS)) {
double lastFullyRecoveredTime = atof(lastFullyRecoveredTimeS.c_str());
// `lastFullyRecoveredTime` is a timestamp taken on the master, so the time interval calculated below may not
// be accurate because of clock skew across the network, but it is good enough for the purpose it is used for.
message["time_since_last_fully_recovered_seconds"] = now() - lastFullyRecoveredTime;
} else {
message["time_since_last_fully_recovered_seconds"] = -1;
}
// Add additional metadata for certain statuses
if (mStatusCode == RecoveryStatus::recruiting_transaction_servers) {
int requiredLogs = atoi( md.getValue("RequiredTLogs").c_str() );
@ -1056,7 +1072,7 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails
// TODO: time_in_recovery: 0.5
// time_in_state: 0.1
TraceEventFields mdActiveGens = wait(activeGens);
const TraceEventFields& mdActiveGens = msgs[0];
if(mdActiveGens.size()) {
int activeGenerations = mdActiveGens.getInt("ActiveGenerations");
message["active_generations"] = activeGenerations;

View File

@ -1276,6 +1276,8 @@ ACTOR Future<Void> trackTlogRecovery( Reference<MasterData> self, Reference<Asyn
.detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
.trackLatest("MasterRecoveryState");
TraceEvent("MasterRecoveryFullyRecovered").trackLatest("MasterRecoveryFullyRecovered");
TraceEvent("MasterRecoveryGenerations", self->dbgid)
.detail("ActiveGenerations", 1)
.trackLatest("MasterRecoveryGenerations");