Merge pull request #4842 from RenxuanW/config

Improve logging of the cluster controller's current view of the database configuration.
This commit is contained in:
A.J. Beamon 2021-06-07 11:28:40 -07:00 committed by GitHub
commit e02ef3b8d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 11 deletions

View File

@@ -1692,20 +1692,37 @@ public:
if (req.configuration.regions.size() > 1) {
std::vector<RegionInfo> regions = req.configuration.regions;
if (regions[0].priority == regions[1].priority && regions[1].dcId == clusterControllerDcId.get()) {
TraceEvent("CCSwitchPrimaryDc", id)
.detail("CCDcId", clusterControllerDcId.get())
.detail("OldPrimaryDcId", regions[0].dcId)
.detail("NewPrimaryDcId", regions[1].dcId);
std::swap(regions[0], regions[1]);
}
if (regions[1].dcId == clusterControllerDcId.get() &&
(!versionDifferenceUpdated || datacenterVersionDifference >= SERVER_KNOBS->MAX_VERSION_DIFFERENCE)) {
if (regions[1].priority >= 0) {
TraceEvent("CCSwitchPrimaryDcVersionDifference", id)
.detail("CCDcId", clusterControllerDcId.get())
.detail("OldPrimaryDcId", regions[0].dcId)
.detail("NewPrimaryDcId", regions[1].dcId);
std::swap(regions[0], regions[1]);
} else {
TraceEvent(SevWarnAlways, "CCDcPriorityNegative")
.detail("DcId", regions[1].dcId)
.detail("Priority", regions[1].priority);
.detail("Priority", regions[1].priority)
.detail("FindWorkersInDc", regions[0].dcId)
.detail("Warning", "Failover did not happen but CC is in remote DC");
}
}
TraceEvent("CCFindWorkersForConfiguration", id)
.detail("CCDcId", clusterControllerDcId.get())
.detail("Region0DcId", regions[0].dcId)
.detail("Region1DcId", regions[1].dcId)
.detail("DatacenterVersionDifference", datacenterVersionDifference)
.detail("VersionDifferenceUpdated", versionDifferenceUpdated);
bool setPrimaryDesired = false;
try {
auto reply = findWorkersForConfigurationFromDC(req, regions[0].dcId);
@@ -1719,6 +1736,10 @@ public:
} else if (regions[0].dcId == clusterControllerDcId.get()) {
return reply.get();
}
TraceEvent(SevWarn, "CCRecruitmentFailed", id)
.detail("Reason", "Recruited Txn system and CC are in different DCs")
.detail("CCDcId", clusterControllerDcId.get())
.detail("RecruitedTxnSystemDcId", regions[0].dcId);
throw no_more_servers();
} catch (Error& e) {
if (!goodRemoteRecruitmentTime.isReady() && regions[1].dcId != clusterControllerDcId.get()) {
@@ -1728,7 +1749,9 @@ public:
if (e.code() != error_code_no_more_servers || regions[1].priority < 0) {
throw;
}
TraceEvent(SevWarn, "AttemptingRecruitmentInRemoteDC", id).error(e);
TraceEvent(SevWarn, "AttemptingRecruitmentInRemoteDc", id)
.detail("SetPrimaryDesired", setPrimaryDesired)
.error(e);
auto reply = findWorkersForConfigurationFromDC(req, regions[1].dcId);
if (!setPrimaryDesired) {
vector<Optional<Key>> dcPriority;

View File

@@ -711,15 +711,10 @@ ACTOR Future<vector<Standalone<CommitTransactionRef>>> recruitEverything(Referen
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::recruiting_transaction_servers)
.detail("Status", RecoveryStatus::names[RecoveryStatus::recruiting_transaction_servers])
.detail("RequiredTLogs", self->configuration.tLogReplicationFactor)
.detail("DesiredTLogs", self->configuration.getDesiredLogs())
.detail("Conf", self->configuration.toString())
.detail("RequiredCommitProxies", 1)
.detail("DesiredCommitProxies", self->configuration.getDesiredCommitProxies())
.detail("RequiredGrvProxies", 1)
.detail("DesiredGrvProxies", self->configuration.getDesiredGrvProxies())
.detail("RequiredResolvers", 1)
.detail("DesiredResolvers", self->configuration.getDesiredResolvers())
.detail("StoreType", self->configuration.storageServerStoreType)
.trackLatest("MasterRecoveryState");
// FIXME: we only need log routers for the same locality as the master
@@ -732,14 +727,25 @@ ACTOR Future<vector<Standalone<CommitTransactionRef>>> recruitEverything(Referen
wait(brokenPromiseToNever(self->clusterController.recruitFromConfiguration.getReply(
RecruitFromConfigurationRequest(self->configuration, self->lastEpochEnd == 0, maxLogRouters))));
std::string primaryDcIds, remoteDcIds;
self->primaryDcId.clear();
self->remoteDcIds.clear();
if (recruits.dcId.present()) {
self->primaryDcId.push_back(recruits.dcId);
if (!primaryDcIds.empty()) {
primaryDcIds += ',';
}
primaryDcIds += printable(recruits.dcId);
if (self->configuration.regions.size() > 1) {
self->remoteDcIds.push_back(recruits.dcId.get() == self->configuration.regions[0].dcId
? self->configuration.regions[1].dcId
: self->configuration.regions[0].dcId);
Key remoteDcId = recruits.dcId.get() == self->configuration.regions[0].dcId
? self->configuration.regions[1].dcId
: self->configuration.regions[0].dcId;
self->remoteDcIds.push_back(remoteDcId);
if (!remoteDcIds.empty()) {
remoteDcIds += ',';
}
remoteDcIds += printable(remoteDcId);
}
}
self->backupWorkers.swap(recruits.backupWorkers);
@@ -755,6 +761,8 @@ ACTOR Future<vector<Standalone<CommitTransactionRef>>> recruitEverything(Referen
.detail("OldLogRouters", recruits.oldLogRouters.size())
.detail("StorageServers", recruits.storageServers.size())
.detail("BackupWorkers", self->backupWorkers.size())
.detail("PrimaryDcIds", primaryDcIds)
.detail("RemoteDcIds", remoteDcIds)
.trackLatest("MasterRecoveryState");
// Actually, newSeedServers does both the recruiting and initialization of the seed servers; so if this is a brand