Report missing old tlogs while recovery is between accepting commits and storage recovery

This commit is contained in:
Young Liu 2020-08-31 07:49:59 -07:00
parent 1ee40848df
commit 23e1ff694c
2 changed files with 30 additions and 1 deletions

View File

@ -278,6 +278,18 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"address":"1.2.3.4:1234"
}
],
"epoch": {
"epoch": 1,
"epoch_begin": 23,
"epoch_end": 112315141
},
"missing_logs": [
{
"id":"6f8d623d0cb9966f",
"healthy":false,
"address":"1.2.3.5:1234"
}
],
"log_replication_factor":3,
"log_write_anti_quorum":0,
"log_fault_tolerance":2,
@ -288,6 +300,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"satellite_log_fault_tolerance":2
}
],
"possibly_losing_old_logs_data": true,
"fault_tolerance":{
"max_zone_failures_without_losing_availability":0,
"max_zone_failures_without_losing_data":0

View File

@ -1910,11 +1910,11 @@ ACTOR static Future<JsonBuilderObject> clusterSummaryStatisticsFetcher(WorkerEve
static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference<AsyncVar<ServerDBInfo>> db, std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
JsonBuilderArray oldTlogsArray;
if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
for(auto it : db->get().logSystemConfig.oldTLogs) {
JsonBuilderObject statusObj;
JsonBuilderArray logsObj;
JsonBuilderArray failedLogsObj;
Optional<int32_t> sat_log_replication_factor, sat_log_write_anti_quorum, sat_log_fault_tolerance, log_replication_factor, log_write_anti_quorum, log_fault_tolerance, remote_log_replication_factor, remote_log_fault_tolerance;
int maxFaultTolerance = 0;
@ -1932,6 +1932,7 @@ static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference<Asyn
logsObj.push_back(logObj);
if(failed) {
failedLogs++;
failedLogsObj.push_back(logObj);
}
}
maxFaultTolerance = std::max(maxFaultTolerance, it.tLogs[i].tLogReplicationFactor - 1 - it.tLogs[i].tLogWriteAntiQuorum - failedLogs);
@ -1953,6 +1954,18 @@ static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference<Asyn
*oldLogFaultTolerance = std::min(*oldLogFaultTolerance, maxFaultTolerance);
statusObj["logs"] = logsObj;
JsonBuilderObject epochInfo;
epochInfo["epoch"] = it.epoch;
epochInfo["epoch_begin"] = it.epochBegin;
epochInfo["epoch_end"] = it.epochEnd;
statusObj["epoch"] = epochInfo;
// We may have lost logs in this log generation, in which case storage servers may never be able
// to catch up on this log generation.
if (maxFaultTolerance < 0) {
statusObj["missing_logs"] = failedLogsObj;
}
if (sat_log_replication_factor.present())
statusObj["satellite_log_replication_factor"] = sat_log_replication_factor.get();
if (sat_log_write_anti_quorum.present())
@ -2419,6 +2432,9 @@ ACTOR Future<StatusReply> clusterGetStatus(
statusObj["old_logs"] = oldTlogFetcher(&oldLogFaultTolerance, db, address_workers);
}
// Used as a signal that storage servers may not be able to catch up on certain log generations
statusObj["possibly_losing_old_logs_data"] = oldLogFaultTolerance < 0;
if(configuration.present()) {
int extraTlogEligibleZones = getExtraTLogEligibleZones(workers, configuration.get());
statusObj["fault_tolerance"] = faultToleranceStatusFetcher(configuration.get(), coordinators, workers, extraTlogEligibleZones, minReplicasRemaining, loadResult.present() && loadResult.get().healthyZone.present());