Recruit backup worker during master recovery

Right now recruit the same number as TLogs. The backup worker does nothing.
This commit is contained in:
Jingyu Zhou 2019-05-15 16:13:04 -07:00
parent eac49bca04
commit ece3cadf8e
5 changed files with 80 additions and 11 deletions

View File

@ -79,13 +79,44 @@ ACTOR Future<Void> pullAsyncData(BackupData* self) {
}
}
ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, uint64_t recoveryCount,
BackupInterface myInterface) {
loop {
bool isDisplaced =
((db->get().recoveryCount > recoveryCount && db->get().recoveryState != RecoveryState::UNINITIALIZED) ||
(db->get().recoveryCount == recoveryCount && db->get().recoveryState == RecoveryState::FULLY_RECOVERED));
if (isDisplaced) {
for (auto& log : db->get().logSystemConfig.tLogs) {
if (std::count(log.backupWorkers.begin(), log.backupWorkers.end(), myInterface.id())) {
isDisplaced = false;
break;
}
}
}
if (isDisplaced) {
for (auto& old : db->get().logSystemConfig.oldTLogs) {
for (auto& log : old.tLogs) {
if (std::count(log.backupWorkers.begin(), log.backupWorkers.end(), myInterface.id())) {
isDisplaced = false;
break;
}
}
}
}
if (isDisplaced) {
throw worker_removed();
}
wait(db->onChange());
}
}
ACTOR Future<Void> backupWorker(
BackupInterface interf, InitializeBackupRequest req,
Reference<AsyncVar<ServerDBInfo>> db)
{
state BackupData self(interf.id(), req);
state PromiseStream<Future<Void>> addActor;
state Future<Void> error = actorCollection( addActor.getFuture() );
state Future<Void> error = actorCollection(addActor.getFuture());
state Future<Void> dbInfoChange = Void();
TraceEvent("BackupWorkerStart", interf.id());
@ -93,19 +124,22 @@ ACTOR Future<Void> backupWorker(
addActor.send(pullAsyncData(&self));
loop choose {
when (wait(dbInfoChange)) {
when(wait(dbInfoChange)) {
dbInfoChange = db->onChange();
self.logSystem.set(ILogSystem::fromServerDBInfo(self.myId, db->get(), true));
}
when (HaltBackupRequest req = waitNext(interf.haltBackup.getFuture())) {
when(HaltBackupRequest req = waitNext(interf.haltBackup.getFuture())) {
req.reply.send(Void());
TraceEvent("BackupWorkerHalted", interf.id()).detail("ReqID", req.requesterID);
break;
}
when (wait(error)) {}
when(wait(checkRemoved(db, req.recoveryCount, interf))) {
TraceEvent("BackupWorkerRemoved", interf.id());
break;
}
when(wait(error)) {}
}
}
catch (Error& e) {
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed) {
TraceEvent("BackupWorkerTerminated", interf.id()).error(e, true);
} else {

View File

@ -37,6 +37,7 @@ struct BackupInterface {
void initEndpoints() {}
UID id() const { return myId; }
NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
UID getToken() const { return haltBackup.getEndpoint().token; }
bool operator== (const BackupInterface& r) const {
return id() == r.id();
}

View File

@ -18,6 +18,12 @@
* limitations under the License.
*/
#include <algorithm>
#include <iterator>
#include <map>
#include <set>
#include <vector>
#include "fdbrpc/FailureMonitor.h"
#include "flow/ActorCollection.h"
#include "fdbclient/NativeAPI.actor.h"
@ -35,7 +41,6 @@
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/Status.h"
#include "fdbserver/LatencyBandConfig.h"
#include <algorithm>
#include "fdbclient/DatabaseContext.h"
#include "fdbserver/RecoveryState.h"
#include "fdbclient/ReadYourWrites.h"
@ -775,6 +780,17 @@ public:
}
}
// TODO: revisit the number of workers. Consider the number of log routers?
auto backupWorkers =
getWorkersForRoleInDatacenter(dcId, ProcessClass::Backup, tlogs.size(), req.configuration, id_used);
std::transform(backupWorkers.begin(), backupWorkers.end(), std::back_inserter(result.backupWorkers),
[](const WorkerDetails& w) { return w.interf; });
auto oldLogRouters = getWorkersForRoleInDatacenter( dcId, ProcessClass::LogRouter, req.maxOldLogRouters, req.configuration, id_used );
for(int i = 0; i < oldLogRouters.size(); i++) {
result.oldLogRouters.push_back(oldLogRouters[i].interf);
}
if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY &&
( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs(), ProcessClass::TLog).betterCount(RoleFitness(tlogs, ProcessClass::TLog)) ||
( region.satelliteTLogReplicationFactor > 0 && RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredSatelliteLogs(dcId), ProcessClass::TLog).betterCount(RoleFitness(satelliteLogs, ProcessClass::TLog)) ) ||

View File

@ -82,6 +82,7 @@ struct TLogSet {
constexpr static FileIdentifier file_identifier = 6302317;
std::vector<OptionalInterface<TLogInterface>> tLogs;
std::vector<OptionalInterface<TLogInterface>> logRouters;
std::vector<OptionalInterface<BackupInterface>> backupWorkers;
int32_t tLogWriteAntiQuorum, tLogReplicationFactor;
std::vector< LocalityData > tLogLocalities; // Stores the localities of the log servers
TLogVersion tLogVersion;
@ -95,12 +96,16 @@ struct TLogSet {
explicit TLogSet(const LogSet& rhs);
std::string toString() const {
return format("anti: %d replication: %d local: %d routers: %d tLogs: %s locality: %d", tLogWriteAntiQuorum, tLogReplicationFactor, isLocal, logRouters.size(), describe(tLogs).c_str(), locality);
return format("anti: %d replication: %d local: %d routers: %d tLogs: %s backupWorkers: %s locality: %d",
tLogWriteAntiQuorum, tLogReplicationFactor, isLocal, logRouters.size(), describe(tLogs).c_str(),
backupWorkers.size(), locality);
}
bool operator == ( const TLogSet& rhs ) const {
if (tLogWriteAntiQuorum != rhs.tLogWriteAntiQuorum || tLogReplicationFactor != rhs.tLogReplicationFactor || isLocal != rhs.isLocal || satelliteTagLocations != rhs.satelliteTagLocations ||
startVersion != rhs.startVersion || tLogs.size() != rhs.tLogs.size() || locality != rhs.locality || logRouters.size() != rhs.logRouters.size()) {
if (tLogWriteAntiQuorum != rhs.tLogWriteAntiQuorum || tLogReplicationFactor != rhs.tLogReplicationFactor ||
isLocal != rhs.isLocal || satelliteTagLocations != rhs.satelliteTagLocations ||
startVersion != rhs.startVersion || tLogs.size() != rhs.tLogs.size() || locality != rhs.locality ||
logRouters.size() != rhs.logRouters.size() || backupWorkers.size() != rhs.backupWorkers.size()) {
return false;
}
if ((tLogPolicy && !rhs.tLogPolicy) || (!tLogPolicy && rhs.tLogPolicy) || (tLogPolicy && (tLogPolicy->info() != rhs.tLogPolicy->info()))) {
@ -116,6 +121,14 @@ struct TLogSet {
return false;
}
}
for (int j = 0; j < backupWorkers.size(); j++) {
if (backupWorkers[j].id() != rhs.backupWorkers[j].id() ||
backupWorkers[j].present() != rhs.backupWorkers[j].present() ||
(backupWorkers[j].present() &&
backupWorkers[j].interf().getToken() != rhs.backupWorkers[j].interf().getToken())) {
return false;
}
}
return true;
}
@ -149,6 +162,9 @@ struct TLogSet {
}
ASSERT(tLogPolicy.getPtr() == nullptr || tLogVersion != TLogVersion::UNSET);
}
if (ar.protocolVersion() > 0x0FDB00B061070001LL) {
serializer(ar, backupWorkers);
}
}
};

View File

@ -150,6 +150,8 @@ struct InitializeLogRouterRequest {
struct InitializeBackupRequest {
UID reqId;
uint64_t recoveryCount;
Version startVersion;
ReplyPromise<struct BackupInterface> reply;
InitializeBackupRequest() = default;
@ -157,7 +159,7 @@ struct InitializeBackupRequest {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, reqId, reply);
serializer(ar, reqId, recoveryCount, startVersion, reply);
}
};