fix: it is not safe to drop logs supporting the current primary datacenter, because configuring usable_regions down will drop the storage servers in the remote region, leaving you with no remaining logs

Evan Tschannen 2018-07-14 16:26:45 -07:00
parent 0f59dc4086
commit 30b2f85020
4 changed files with 13 additions and 10 deletions
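In other words, the rule this commit enforces is that old transaction log generations may only be dropped from the core state once the primary recovery is durable, and, when no repopulateRegionAntiQuorum is configured, once the remote recovery is durable as well. A minimal sketch of that rule, using hypothetical booleans in place of the real recovery futures (the actual condition is in the TagPartitionedLogSystem hunk below):

    // Hedged sketch, not FoundationDB code: models when old tlog generations
    // can safely be dropped from DBCoreState under the rule this commit enforces.
    bool canDropOldLogs(bool primaryRecovered, bool remoteRecovered, int repopulateRegionAntiQuorum) {
        if (!primaryRecovered)
            return false;  // these logs still support the current primary datacenter
        if (repopulateRegionAntiQuorum == 0 && !remoteRecovered)
            return false;  // the remote region has not caught up yet
        return true;
    }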

View File

@@ -780,7 +780,7 @@ public:
     }
     void checkRecoveryStalled() {
-        if(db.serverInfo->get().recoveryState < RecoveryState::RECOVERY_TRANSACTION && db.recoveryStalled ) {
+        if( (db.serverInfo->get().recoveryState == RecoveryState::RECRUITING || db.serverInfo->get().recoveryState == RecoveryState::ACCEPTING_COMMITS || db.serverInfo->get().recoveryState == RecoveryState::ALL_LOGS_RECRUITED) && db.recoveryStalled ) {
             if(db.config.regions.size() > 1 && clusterControllerDcId.present()) {
                 auto regions = db.config.regions;
                 if(clusterControllerDcId.get() == regions[0].dcId) {
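The stall check above now names the exact recovery states it applies to instead of comparing against RecoveryState::RECOVERY_TRANSACTION with operator<. A hedged sketch of the narrowed predicate, using a simplified stand-in enum (the real RecoveryState lives in fdbserver and has more states, in a different order):

    // Stand-in enum for illustration only; not the real fdbserver RecoveryState.
    enum class RecoveryState { RECRUITING, ACCEPTING_COMMITS, ALL_LOGS_RECRUITED, FULLY_RECOVERED };

    // The stall check now applies only in these explicitly listed states.
    bool stallCheckApplies(RecoveryState s) {
        return s == RecoveryState::RECRUITING ||
               s == RecoveryState::ACCEPTING_COMMITS ||
               s == RecoveryState::ALL_LOGS_RECRUITED;
    }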

View File

@@ -590,6 +590,8 @@ struct ILogSystem {
     virtual void toCoreState( DBCoreState& ) = 0;
+    virtual bool remoteStorageRecovered() = 0;
     virtual Future<Void> onCoreStateChanged() = 0;
         // Returns if and when the output of toCoreState() would change (for example, when older logs can be discarded from the state)
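remoteStorageRecovered() is the new pure virtual on the interface; an implementation answers it by checking whether its remote recovery future is both valid and ready. A hedged analogue of that idiom, using std::future in place of flow's Future<Void>:

    #include <chrono>
    #include <future>

    // Hedged analogue, not the flow-based implementation: approximates
    // "isValid() && isReady()" with a std::future readiness check.
    struct LogSystemSketch {
        std::future<void> remoteRecoveryComplete;  // stand-in for Future<Void>

        bool remoteStorageRecovered() const {
            return remoteRecoveryComplete.valid() &&
                   remoteRecoveryComplete.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
        }
    };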

View File

@@ -283,14 +283,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
         }
         newState.oldTLogData.clear();
-        int recoveredCount = 0;
-        if(recoveryComplete.isValid() && recoveryComplete.isReady()) {
-            recoveredCount++;
-        }
-        if(remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isReady()) {
-            recoveredCount++;
-        }
-        if(recoveredCount < 2 - repopulateRegionAntiQuorum) {
+        if(!recoveryComplete.isValid() || !recoveryComplete.isReady() || (repopulateRegionAntiQuorum == 0 && (!remoteRecoveryComplete.isValid() || !remoteRecoveryComplete.isReady()))) {
             newState.oldTLogData.resize(oldLogData.size());
             for(int i = 0; i < oldLogData.size(); i++) {
                 for(auto &t : oldLogData[i].tLogs) {
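Compared with the counting rule it replaces, the new condition above keeps the old tlog generations whenever the primary recovery is incomplete, even when repopulateRegionAntiQuorum > 0 and the remote recovery has finished, which is exactly the case the commit message calls unsafe. A hedged comparison of the two rules with illustrative stand-in booleans:

    #include <cstdio>

    // Hedged sketch with illustrative names, not FoundationDB code.
    static bool keepOldLogsOld(bool primaryDone, bool remoteDone, int antiQuorum) {
        int recoveredCount = (primaryDone ? 1 : 0) + (remoteDone ? 1 : 0);
        return recoveredCount < 2 - antiQuorum;                   // rule removed above
    }
    static bool keepOldLogsNew(bool primaryDone, bool remoteDone, int antiQuorum) {
        return !primaryDone || (antiQuorum == 0 && !remoteDone);  // rule added above
    }

    int main() {
        // Prints the single disagreement: antiQuorum=1, primary not recovered,
        // remote recovered -> the old rule dropped the logs, the new rule keeps them.
        for (int aq = 0; aq <= 1; ++aq)
            for (int p = 0; p <= 1; ++p)
                for (int r = 0; r <= 1; ++r)
                    if (keepOldLogsOld(p, r, aq) != keepOldLogsNew(p, r, aq))
                        std::printf("antiQuorum=%d primary=%d remote=%d old=%d new=%d\n",
                                    aq, p, r, (int)keepOldLogsOld(p, r, aq), (int)keepOldLogsNew(p, r, aq));
        return 0;
    }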
@@ -318,6 +311,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
         newState.logSystemType = logSystemType;
     }
+    virtual bool remoteStorageRecovered() {
+        return remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isReady();
+    }
     virtual Future<Void> onCoreStateChanged() {
         std::vector<Future<Void>> changes;
         changes.push_back(Never());

View File

@@ -1082,12 +1082,16 @@ ACTOR Future<Void> trackTlogRecovery( Reference<MasterData> self, Reference<Asyn
                 .trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
         }
+        if(newState.oldTLogData.size() && self->configuration.repopulateRegionAntiQuorum > 0 && self->logSystem->remoteStorageRecovered()) {
+            TraceEvent(SevWarnAlways, "RecruitmentStalled_RemoteStorageRecovered", self->dbgid);
+            self->recruitmentStalled->set(true);
+        }
         self->registrationTrigger.trigger();
         if(allLogs && remoteRecovered.canBeSet()) {
             remoteRecovered.send(Void());
         }
         if( finalUpdate ) {
             oldLogSystems->get()->stopRejoins();
             rejoinRequests = rejoinRequestHandler(self);
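The master-side check added above raises a SevWarnAlways trace and marks recruitment as stalled when a repopulate anti-quorum is configured, remote storage has recovered, and old tlog generations still remain in the new core state. A hedged sketch of just that decision, with illustrative names:

    #include <cstddef>

    // Hedged sketch, illustrative names only; mirrors the condition added above.
    bool shouldFlagRecruitmentStalled(std::size_t oldTLogGenerations,
                                      int repopulateRegionAntiQuorum,
                                      bool remoteStorageRecovered) {
        return oldTLogGenerations > 0 && repopulateRegionAntiQuorum > 0 && remoteStorageRecovered;
    }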