Merge pull request #4518 from halfprice/zhewu/log-tlog-recruitment-failure-reason

Log more detailed information during TLog recruitment, including the reason each unavailable worker was rejected.
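With this change, every worker rejected as a TLog candidate emits a GetTLogTeamWorkerUnavailable trace event recording the rejection reason together with the worker's ID, data center, address, and fitness. In FoundationDB's default XML trace format such an event would render roughly as follows (attribute values are illustrative, and standard attributes such as Severity and Time are omitted):

    <Event Type="GetTLogTeamWorkerUnavailable" ID="..." Reason="Worker is not in the target DC" WorkerID="..." WorkerDC="..." Address="..." Fitness="..." />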
Meng Xu 2021-03-19 11:36:05 -07:00 committed by GitHub
commit 0cedef123b
1 changed file with 59 additions and 11 deletions


@@ -320,6 +320,17 @@ public:
 		return results;
 	}
+	// Selects workers as TLogs from available workers based on input parameters.
+	// conf: the database configuration.
+	// required: the required number of TLog workers to select.
+	// desired: the desired number of TLog workers to select.
+	// policy: the TLog replication policy the selection needs to satisfy.
+	// id_used: keeps track of the process IDs of selected workers.
+	// checkStable: when true, only select from workers that are considered stable (i.e. not rebooted more than
+	//              twice recently).
+	// dcIds: the target data centers the workers are in. The selected workers must all be from these
+	//        data centers.
+	// exclusionWorkerIds: the workers to be excluded from the selection.
 	std::vector<WorkerDetails> getWorkersForTlogs(DatabaseConfiguration const& conf,
 	                                              int32_t required,
 	                                              int32_t desired,
@@ -346,19 +357,56 @@ public:
 		logServerSet = Reference<LocalitySet>(new LocalityMap<WorkerDetails>());
 		logServerMap = (LocalityMap<WorkerDetails>*)logServerSet.getPtr();
-		for (auto& it : id_worker) {
-			if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), it.second.details.interf.id()) ==
+
+		// Populate `unavailableLocals` and log the reason why the worker is considered unavailable.
+		auto logWorkerUnavailable = [this, &unavailableLocals](const std::string& reason,
+		                                                       const WorkerDetails& details,
+		                                                       ProcessClass::Fitness fitness) {
+			unavailableLocals.push_back(details.interf.locality);
+			// Note that the recruitment happens only during initial database creation and recovery, so these trace
+			// events should be sparse.
+			// TODO(zhewu): Add targeting dcids.
+			TraceEvent("GetTLogTeamWorkerUnavailable", id)
+			    .detail("Reason", reason)
+			    .detail("WorkerID", details.interf.id())
+			    .detail("WorkerDC", details.interf.locality.dcId())
+			    .detail("Address", details.interf.addresses().toString())
+			    .detail("Fitness", fitness);
+		};
+
+		// Go through all the workers to list all the workers that can be recruited.
+		for (const auto& [worker_process_id, worker_info] : id_worker) {
+			const auto& worker_details = worker_info.details;
+			auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog);
+
+			if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) !=
 			    exclusionWorkerIds.end()) {
-				auto fitness = it.second.details.processClass.machineClassFitness(ProcessClass::TLog);
-				if (workerAvailable(it.second, checkStable) &&
-				    !conf.isExcludedServer(it.second.details.interf.addresses()) &&
-				    fitness != ProcessClass::NeverAssign &&
-				    (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) {
-					fitness_workers[std::make_pair(fitness, it.second.details.degraded)].push_back(it.second.details);
-				} else {
-					unavailableLocals.push_back(it.second.details.interf.locality);
-				}
+				logWorkerUnavailable("Worker is excluded", worker_details, fitness);
+				continue;
 			}
+
+			if (!workerAvailable(worker_info, checkStable)) {
+				logWorkerUnavailable("Worker is not available", worker_details, fitness);
+				continue;
+			}
+
+			if (conf.isExcludedServer(worker_details.interf.addresses())) {
+				logWorkerUnavailable("Worker server is excluded from the cluster", worker_details, fitness);
+				continue;
+			}
+
+			if (fitness == ProcessClass::NeverAssign) {
+				logWorkerUnavailable("Worker's fitness is NeverAssign", worker_details, fitness);
+				continue;
+			}
+
+			if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) {
+				logWorkerUnavailable("Worker is not in the target DC", worker_details, fitness);
+				continue;
+			}
+
+			// This worker is a candidate for TLog recruitment.
+			fitness_workers[std::make_pair(fitness, worker_details.degraded)].push_back(worker_details);
 		}
 		results.reserve(results.size() + id_worker.size());
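
The refactoring pattern is worth noting: one compound eligibility condition with a catch-all else branch is replaced by guard clauses that each log a specific rejection reason and `continue`. Below is a minimal, self-contained sketch of that pattern in plain C++ (not FoundationDB code; the Worker struct and logUnavailable helper are hypothetical stand-ins for WorkerDetails and logWorkerUnavailable):

    // Guard-clause candidate filtering with per-reason logging (requires C++17).
    #include <iostream>
    #include <string>
    #include <vector>

    struct Worker {
        std::string id;
        std::string dc;
        bool available = true;
        bool excluded = false;
    };

    int main() {
        std::vector<Worker> workers = { { "w1", "dc1" },
                                        { "w2", "dc2", false },
                                        { "w3", "dc1", true, true } };
        const std::string targetDc = "dc1";

        // Mirrors logWorkerUnavailable: record why a candidate was rejected.
        auto logUnavailable = [](const Worker& w, const std::string& reason) {
            std::cout << "WorkerUnavailable id=" << w.id << " reason=" << reason << "\n";
        };

        std::vector<Worker> candidates;
        for (const auto& w : workers) {
            if (w.excluded) {
                logUnavailable(w, "Worker is excluded");
                continue;
            }
            if (!w.available) {
                logUnavailable(w, "Worker is not available");
                continue;
            }
            if (w.dc != targetDc) {
                logUnavailable(w, "Worker is not in the target DC");
                continue;
            }
            candidates.push_back(w); // eligible for recruitment
        }
        std::cout << "candidates: " << candidates.size() << "\n";
    }

Because each rejected candidate now hits exactly one guard, every worker produces exactly one reason, which is what makes the new trace events actionable when debugging recruitment during recovery.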