Merge pull request #3320 from ajbeamon/backport-region-config-status-changes

Backport region config status changes to release-6.2
This commit is contained in:
Evan Tschannen 2020-06-15 14:23:13 -07:00 committed by GitHub
commit f6f9fb1147
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 120 additions and 3 deletions

View File

@ -494,6 +494,7 @@
"data_distribution_disabled_for_ss_failures":true,
"data_distribution_disabled_for_rebalance":true,
"data_distribution_disabled":true,
"active_primary_dc":"pv",
"configuration":{
"log_anti_quorum":0,
"log_replicas":2,

View File

@ -2,6 +2,14 @@
Release Notes
#############
6.2.23
======
Status
------
* Added ``cluster.active_primary_dc`` that indicates which datacenter is serving as the primary datacenter in multi-region setups. `(PR #3320) <https://github.com/apple/foundationdb/pull/3320>`_
6.2.22
======

View File

@ -945,7 +945,11 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,
StatusObjectReader statusObjConfig;
StatusArray excludedServersArr;
Optional<std::string> activePrimaryDC;
if (statusObjCluster.has("active_primary_dc")) {
activePrimaryDC = statusObjCluster["active_primary_dc"].get_str();
}
if (statusObjCluster.get("configuration", statusObjConfig)) {
if (statusObjConfig.has("excluded_servers"))
excludedServersArr = statusObjConfig.last().get_array();
@ -1001,6 +1005,73 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,
if (statusObjConfig.get("log_routers", intVal))
outputString += format("\n Desired Log Routers - %d", intVal);
outputString += "\n Usable Regions - ";
if (statusObjConfig.get("usable_regions", intVal)) {
outputString += std::to_string(intVal);
} else {
outputString += "unknown";
}
// Append a per-region summary of the "regions" configuration to outputString.
// For each region this prints a heading, its main (non-satellite) datacenter,
// the list of satellite datacenters, and whichever satellite_* settings the
// region object carries. The heading is "Primary"/"Remote" when the cluster
// reported an active primary DC, and the neutral "Region" otherwise.
StatusArray regions;
if (statusObjConfig.has("regions")) {
	outputString += "\n Regions: ";
	regions = statusObjConfig["regions"].get_array();
	for (StatusObjectReader region : regions) {
		// Per-region accumulators. They are declared inside the loop on purpose:
		// when they lived outside it, satellites collected for one region were
		// printed again under every following region, and a region without a
		// non-satellite datacenter inherited the previous region's datacenter.
		bool isPrimary = false;
		std::vector<std::string> regionSatelliteDCs;
		std::string regionDC;
		for (StatusObjectReader dc : region["datacenters"].get_array()) {
			if (!dc.has("satellite")) {
				// A datacenter entry without a "satellite" field is the region's main DC.
				regionDC = dc["id"].get_str();
				if (activePrimaryDC.present() && regionDC == activePrimaryDC.get()) {
					isPrimary = true;
				}
			} else if (dc["satellite"].get_int() == 1) {
				regionSatelliteDCs.push_back(dc["id"].get_str());
			}
		}
		if (activePrimaryDC.present()) {
			if (isPrimary) {
				outputString += "\n Primary -";
			} else {
				outputString += "\n Remote -";
			}
		} else {
			outputString += "\n Region -";
		}
		outputString += format("\n Datacenter - %s", regionDC.c_str());
		if (!regionSatelliteDCs.empty()) {
			outputString += "\n Satellite datacenters - ";
			// Comma-separated list; size_t index avoids a signed/unsigned comparison.
			for (size_t i = 0; i < regionSatelliteDCs.size(); i++) {
				if (i != 0) {
					outputString += ", ";
				}
				outputString += format("%s", regionSatelliteDCs[i].c_str());
			}
		}
		// Optional satellite settings: each line is printed only when the field is present.
		if (region.get("satellite_redundancy_mode", strVal)) {
			outputString += format("\n Satellite Redundancy Mode - %s", strVal.c_str());
		}
		if (region.get("satellite_anti_quorum", intVal)) {
			outputString += format("\n Satellite Anti Quorum - %d", intVal);
		}
		if (region.get("satellite_logs", intVal)) {
			outputString += format("\n Satellite Logs - %d", intVal);
		}
		if (region.get("satellite_log_policy", strVal)) {
			outputString += format("\n Satellite Log Policy - %s", strVal.c_str());
		}
		if (region.get("satellite_log_replicas", intVal)) {
			outputString += format("\n Satellite Log Replicas - %d", intVal);
		}
		if (region.get("satellite_usable_dcs", intVal)) {
			outputString += format("\n Satellite Usable DCs - %d", intVal);
		}
	}
}
}
catch (std::runtime_error& ) {
outputString = outputStringCache;

View File

@ -520,6 +520,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"data_distribution_disabled_for_ss_failures":true,
"data_distribution_disabled_for_rebalance":true,
"data_distribution_disabled":true,
"active_primary_dc":"pv",
"configuration":{
"log_anti_quorum":0,
"log_replicas":2,

View File

@ -2145,6 +2145,35 @@ ACTOR Future<JsonBuilderObject> lockedStatusFetcher(Reference<AsyncVar<CachedSer
return statusObj;
}
// Reads the value stored under primaryDatacenterKey using a transaction on `cx`
// and returns it.
//
// cx       - database handle the read transaction is created on.
// messages - status message array; this actor appends to it when the key is
//            absent ("primary_dc_missing") or when the fetch times out
//            ("fetch_primary_dc_timedout"). It is written to after a wait, so
//            it must remain valid until the returned future is ready.
//
// Returns the value read (an empty Optional if the key is not present), or an
// empty Optional if a timed_out error is caught. Errors other than timed_out
// are passed to tr.onError() and the read is retried; an error thrown by
// tr.onError() itself is not caught here.
ACTOR Future<Optional<Value>> getActivePrimaryDC(Database cx, JsonBuilderArray* messages) {
state ReadYourWritesTransaction tr(cx);
// Overall deadline for the whole retry loop (delay(5); presumably seconds - confirm).
state Future<Void> readTimeout = delay(5); // so that we won't loop forever
loop {
try {
// The overall deadline is only checked here, at the start of each attempt.
if (readTimeout.isReady()) {
throw timed_out();
}
// Options are set on every attempt, before the read is issued.
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
// Each individual read is additionally wrapped in timeoutError(..., 5);
// presumably that surfaces as timed_out, which the catch below handles - confirm.
Optional<Value> res = wait(timeoutError(tr.get(primaryDatacenterKey), 5));
if (!res.present()) {
// Key absent: report it, but still return the (empty) result.
messages->push_back(
JsonString::makeMessage("primary_dc_missing", "Unable to determine primary datacenter."));
}
return res;
} catch (Error& e) {
if (e.code() == error_code_timed_out) {
// Reached both from the explicit throw above and from any timed_out raised by the read.
messages->push_back(
JsonString::makeMessage("fetch_primary_dc_timedout", "Fetching primary DC timed out."));
return Optional<Value>();
} else {
// NOTE(review): this wait is not bounded by readTimeout; the deadline is
// only re-checked once the loop comes back around.
wait(tr.onError(e));
}
}
}
}
// constructs the cluster section of the json status output
ACTOR Future<StatusReply> clusterGetStatus(
Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db,
@ -2323,6 +2352,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
state Future<ErrorOr<vector<std::pair<MasterProxyInterface, EventMap>>>> proxyFuture = errorOr(getProxiesAndMetrics(db, address_workers));
state int minReplicasRemaining = -1;
state Future<Optional<Value>> primaryDCFO = getActivePrimaryDC(cx, &messages);
std::vector<Future<JsonBuilderObject>> futures2;
futures2.push_back(dataStatusFetcher(ddWorker, configuration.get(), &minReplicasRemaining));
futures2.push_back(workloadStatusFetcher(db, workers, mWorker, rkWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture));
@ -2341,11 +2371,17 @@ ACTOR Future<StatusReply> clusterGetStatus(
statusObj["fault_tolerance"] = faultToleranceStatusFetcher(configuration.get(), coordinators, workers, extraTlogEligibleZones, minReplicasRemaining, loadResult.present() && loadResult.get().healthyZone.present());
}
JsonBuilderObject configObj = configurationFetcher(configuration, coordinators, &status_incomplete_reasons);
state JsonBuilderObject configObj =
configurationFetcher(configuration, coordinators, &status_incomplete_reasons);
wait(success(primaryDCFO));
if (primaryDCFO.get().present()) {
statusObj["active_primary_dc"] = primaryDCFO.get().get();
}
// configArr could be empty
if (!configObj.empty())
if (!configObj.empty()) {
statusObj["configuration"] = configObj;
}
// workloadStatusFetcher returns the workload section but also optionally writes the qos section and adds to the data_overlay object
if (!workerStatuses[1].empty())

View File

@ -19,7 +19,7 @@ struct TriggerRecoveryLoopWorkload : TestWorkload {
numRecoveries = getOption(options, LiteralStringRef("numRecoveries"), deterministicRandom()->randomInt(1, 10));
delayBetweenRecoveries = getOption(options, LiteralStringRef("delayBetweenRecoveries"), 0.0);
killAllProportion = getOption(options, LiteralStringRef("killAllProportion"), 0.1);
ASSERT(numRecoveries > 0 && startTime >= 0 and delayBetweenRecoveries >= 0);
ASSERT((numRecoveries > 0) && (startTime >= 0) && (delayBetweenRecoveries >= 0));
TraceEvent(SevInfo, "TriggerRecoveryLoopSetup")
.detail("StartTime", startTime)
.detail("NumRecoveries", numRecoveries)