Merge pull request #4518 from halfprice/zhewu/log-tlog-recruitment-failure-reason

Log more detailed information during TLog recruitment, including the reason each unavailable worker was rejected.
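With this change, every worker rejected as a TLog candidate emits a GetTLogTeamWorkerUnavailable trace event recording the rejection reason together with the worker's ID, data center, address, and fitness. In FoundationDB's default XML trace format such an event would render roughly as follows (attribute values are illustrative, and standard attributes such as Severity and Time are omitted):

    <Event Type="GetTLogTeamWorkerUnavailable" ID="..." Reason="Worker is not in the target DC" WorkerID="..." WorkerDC="..." Address="..." Fitness="..." />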
Meng Xu 2021-03-19 11:36:05 -07:00 committed by GitHub
commit 0cedef123b
1 changed file with 59 additions and 11 deletions


@@ -320,6 +320,17 @@ public:
 		return results;
 	}
+	// Selects workers as TLogs from available workers based on input parameters.
+	// conf: the database configuration.
+	// required: the required number of TLog workers to select.
+	// desired: the desired number of TLog workers to select.
+	// policy: the TLog replication policy the selection needs to satisfy.
+	// id_used: keeps track of the process IDs of selected workers.
+	// checkStable: when true, only select from workers that are considered stable (i.e. not rebooted more than
+	//              twice recently).
+	// dcIds: the target data centers the workers are in. The selected workers must all be from these
+	//        data centers.
+	// exclusionWorkerIds: the workers to be excluded from the selection.
 	std::vector<WorkerDetails> getWorkersForTlogs(DatabaseConfiguration const& conf,
 	                                              int32_t required,
 	                                              int32_t desired,
@@ -346,19 +357,56 @@ public:
 		logServerSet = Reference<LocalitySet>(new LocalityMap<WorkerDetails>());
 		logServerMap = (LocalityMap<WorkerDetails>*)logServerSet.getPtr();
-		for (auto& it : id_worker) {
-			if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), it.second.details.interf.id()) ==
+
+		// Populate `unavailableLocals` and log the reason why the worker is considered unavailable.
+		auto logWorkerUnavailable = [this, &unavailableLocals](const std::string& reason,
+		                                                       const WorkerDetails& details,
+		                                                       ProcessClass::Fitness fitness) {
+			unavailableLocals.push_back(details.interf.locality);
+			// Note that the recruitment happens only during initial database creation and recovery, so these trace
+			// events should be sparse.
+			// TODO(zhewu): Add targeting dcids.
+			TraceEvent("GetTLogTeamWorkerUnavailable", id)
+			    .detail("Reason", reason)
+			    .detail("WorkerID", details.interf.id())
+			    .detail("WorkerDC", details.interf.locality.dcId())
+			    .detail("Address", details.interf.addresses().toString())
+			    .detail("Fitness", fitness);
+		};
+
+		// Go through all the workers to list all the workers that can be recruited.
+		for (const auto& [worker_process_id, worker_info] : id_worker) {
+			const auto& worker_details = worker_info.details;
+			auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog);
+
+			if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) !=
 			    exclusionWorkerIds.end()) {
-				auto fitness = it.second.details.processClass.machineClassFitness(ProcessClass::TLog);
-				if (workerAvailable(it.second, checkStable) &&
-				    !conf.isExcludedServer(it.second.details.interf.addresses()) &&
-				    fitness != ProcessClass::NeverAssign &&
-				    (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) {
-					fitness_workers[std::make_pair(fitness, it.second.details.degraded)].push_back(it.second.details);
-				} else {
-					unavailableLocals.push_back(it.second.details.interf.locality);
-				}
+				logWorkerUnavailable("Worker is excluded", worker_details, fitness);
+				continue;
 			}
+
+			if (!workerAvailable(worker_info, checkStable)) {
+				logWorkerUnavailable("Worker is not available", worker_details, fitness);
+				continue;
+			}
+
+			if (conf.isExcludedServer(worker_details.interf.addresses())) {
+				logWorkerUnavailable("Worker server is excluded from the cluster", worker_details, fitness);
+				continue;
+			}
+
+			if (fitness == ProcessClass::NeverAssign) {
+				logWorkerUnavailable("Worker's fitness is NeverAssign", worker_details, fitness);
+				continue;
+			}
+
+			if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) {
+				logWorkerUnavailable("Worker is not in the target DC", worker_details, fitness);
+				continue;
+			}
+
+			// This worker is a candidate for TLog recruitment.
+			fitness_workers[std::make_pair(fitness, worker_details.degraded)].push_back(worker_details);
 		}
 		results.reserve(results.size() + id_worker.size());
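
The refactoring pattern is worth noting: one compound eligibility condition with a catch-all else branch is replaced by guard clauses that each log a specific rejection reason and `continue`. Below is a minimal, self-contained sketch of that pattern in plain C++ (not FoundationDB code; the Worker struct and logUnavailable helper are hypothetical stand-ins for WorkerDetails and logWorkerUnavailable):

    // Guard-clause candidate filtering with per-reason logging (requires C++17).
    #include <iostream>
    #include <string>
    #include <vector>

    struct Worker {
        std::string id;
        std::string dc;
        bool available = true;
        bool excluded = false;
    };

    int main() {
        std::vector<Worker> workers = { { "w1", "dc1" },
                                        { "w2", "dc2", false },
                                        { "w3", "dc1", true, true } };
        const std::string targetDc = "dc1";

        // Mirrors logWorkerUnavailable: record why a candidate was rejected.
        auto logUnavailable = [](const Worker& w, const std::string& reason) {
            std::cout << "WorkerUnavailable id=" << w.id << " reason=" << reason << "\n";
        };

        std::vector<Worker> candidates;
        for (const auto& w : workers) {
            if (w.excluded) {
                logUnavailable(w, "Worker is excluded");
                continue;
            }
            if (!w.available) {
                logUnavailable(w, "Worker is not available");
                continue;
            }
            if (w.dc != targetDc) {
                logUnavailable(w, "Worker is not in the target DC");
                continue;
            }
            candidates.push_back(w); // eligible for recruitment
        }
        std::cout << "candidates: " << candidates.size() << "\n";
    }

Because each rejected candidate now hits exactly one guard, every worker produces exactly one reason, which is what makes the new trace events actionable when debugging recruitment during recovery.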