fixed many more bugs associated with running without remote logs

This commit is contained in:
Evan Tschannen 2018-01-17 17:03:17 -08:00
parent 8f58bdd1cd
commit 264dc44dfa
6 changed files with 51 additions and 33 deletions

View File

@ -468,6 +468,7 @@ struct DDTeamCollection {
Promise<Void> serverTrackerErrorOut; Promise<Void> serverTrackerErrorOut;
AsyncVar<int> recruitingStream; AsyncVar<int> recruitingStream;
Debouncer restartRecruiting; Debouncer restartRecruiting;
Promise<Void> hasTeams;
int healthyTeamCount; int healthyTeamCount;
PromiseStream<Void> zeroHealthyTeams; PromiseStream<Void> zeroHealthyTeams;
@ -561,6 +562,7 @@ struct DDTeamCollection {
// use keys, src, dest, metrics, priority, system load, etc.. to decide... // use keys, src, dest, metrics, priority, system load, etc.. to decide...
ACTOR Future<Void> getTeam( DDTeamCollection* self, GetTeamRequest req ) { ACTOR Future<Void> getTeam( DDTeamCollection* self, GetTeamRequest req ) {
try { try {
Void _ = wait( self->hasTeams.getFuture() );
Void _ = wait( self->checkBuildTeams( self ) ); Void _ = wait( self->checkBuildTeams( self ) );
// Select the best team // Select the best team
@ -728,6 +730,10 @@ struct DDTeamCollection {
addTeam(t->begin(), t->end() ); addTeam(t->begin(), t->end() );
} }
if( teams.size() && hasTeams.canBeSet() ) {
hasTeams.send(Void());
}
addSubsetOfEmergencyTeams(); addSubsetOfEmergencyTeams();
} }
@ -1034,6 +1040,10 @@ struct DDTeamCollection {
//Let all of these changes get worked out before responding to the get team request //Let all of these changes get worked out before responding to the get team request
Void _ = wait( delay(0, TaskDataDistributionLaunch) ); Void _ = wait( delay(0, TaskDataDistributionLaunch) );
if( self->teams.size() && self->hasTeams.canBeSet() ) {
self->hasTeams.send(Void());
}
return Void(); return Void();
} }

View File

@ -95,7 +95,7 @@ public:
if(hasBest) { if(hasBest) {
for(auto& t : tags) { for(auto& t : tags) {
if(t.locality == locality || t.locality == tagLocalitySpecial) { if(t.locality == locality || t.locality == tagLocalitySpecial || locality == tagLocalitySpecial) {
newLocations.push_back(bestLocationFor(t)); newLocations.push_back(bestLocationFor(t));
} }
} }

View File

@ -352,7 +352,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
} }
int bestSet = -1; int bestSet = -1;
for(int t = 0; t < tLogs.size(); t++) { for(int t = 0; t < tLogs.size(); t++) {
if(tLogs[t]->hasBest && (tLogs[t]->locality == tag.locality || tag.locality == tagLocalitySpecial)) { if(tLogs[t]->hasBest && (tLogs[t]->locality == tag.locality || tag.locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalitySpecial)) {
bestSet = t; bestSet = t;
break; break;
} }
@ -368,7 +368,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
for(int i = 0; i < oldLogData.size() && begin < oldLogData[i].epochEnd; i++) { for(int i = 0; i < oldLogData.size() && begin < oldLogData[i].epochEnd; i++) {
int bestOldSet = -1; int bestOldSet = -1;
for(int t = 0; t < oldLogData[i].tLogs.size(); t++) { for(int t = 0; t < oldLogData[i].tLogs.size(); t++) {
if(oldLogData[i].tLogs[t]->hasBest && (oldLogData[i].tLogs[t]->locality == tag.locality || tag.locality == tagLocalitySpecial)) { if(oldLogData[i].tLogs[t]->hasBest && (oldLogData[i].tLogs[t]->locality == tag.locality || tag.locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalitySpecial)) {
bestOldSet = t; bestOldSet = t;
break; break;
} }
@ -394,7 +394,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
} else { } else {
int bestSet = -1; int bestSet = -1;
for(int t = 0; t < tLogs.size(); t++) { for(int t = 0; t < tLogs.size(); t++) {
if(tLogs[t]->hasBest && (tLogs[t]->locality == tag.locality || tag.locality == tagLocalitySpecial)) { if(tLogs[t]->hasBest && (tLogs[t]->locality == tag.locality || tag.locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalitySpecial)) {
bestSet = t; bestSet = t;
break; break;
} }
@ -1175,8 +1175,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
if(configuration.remoteTLogReplicationFactor > 0) { if(configuration.remoteTLogReplicationFactor > 0) {
logSystem->remoteRecovery = TagPartitionedLogSystem::newRemoteEpoch(logSystem.getPtr(), oldLogSystem, fRemoteWorkers, configuration, recoveryCount, minTag, remoteLocality); logSystem->remoteRecovery = TagPartitionedLogSystem::newRemoteEpoch(logSystem.getPtr(), oldLogSystem, fRemoteWorkers, configuration, recoveryCount, minTag, remoteLocality);
} else { } else {
logSystem->remoteRecovery = Void(); logSystem->remoteRecovery = logSystem->recoveryComplete;
logSystem->remoteRecoveryComplete = Void(); logSystem->remoteRecoveryComplete = logSystem->recoveryComplete;
} }
return logSystem; return logSystem;

View File

@ -289,30 +289,34 @@ ACTOR Future<Void> newResolvers( Reference<MasterData> self, RecruitFromConfigur
} }
ACTOR Future<Void> newTLogServers( Reference<MasterData> self, RecruitFromConfigurationReply recr, Reference<ILogSystem> oldLogSystem, vector<Standalone<CommitTransactionRef>>* initialConfChanges ) { ACTOR Future<Void> newTLogServers( Reference<MasterData> self, RecruitFromConfigurationReply recr, Reference<ILogSystem> oldLogSystem, vector<Standalone<CommitTransactionRef>>* initialConfChanges ) {
state Optional<Key> primaryDcId = recr.remoteDcId == self->configuration.remoteDcId ? self->configuration.primaryDcId : self->configuration.remoteDcId; if(self->configuration.remoteTLogReplicationFactor > 0) {
if( !self->dcId_locality.count(primaryDcId) ) { state Optional<Key> primaryDcId = recr.remoteDcId == self->configuration.remoteDcId ? self->configuration.primaryDcId : self->configuration.remoteDcId;
TraceEvent("UnknownPrimaryDCID", self->dbgid).detail("found", self->dcId_locality.count(primaryDcId)).detail("primaryId", printable(primaryDcId)); if( !self->dcId_locality.count(primaryDcId) ) {
int8_t loc = self->getNextLocality(); TraceEvent(SevWarnAlways, "UnknownPrimaryDCID", self->dbgid).detail("found", self->dcId_locality.count(primaryDcId)).detail("primaryId", printable(primaryDcId));
Standalone<CommitTransactionRef> tr; int8_t loc = self->getNextLocality();
tr.set(tr.arena(), tagLocalityListKeyFor(primaryDcId), tagLocalityListValue(loc)); Standalone<CommitTransactionRef> tr;
initialConfChanges->push_back(tr); tr.set(tr.arena(), tagLocalityListKeyFor(primaryDcId), tagLocalityListValue(loc));
self->dcId_locality[primaryDcId] = loc; initialConfChanges->push_back(tr);
self->dcId_locality[primaryDcId] = loc;
}
if( !self->dcId_locality.count(recr.remoteDcId) ) {
TraceEvent(SevWarnAlways, "UnknownRemoteDCID", self->dbgid).detail("remoteFound", self->dcId_locality.count(recr.remoteDcId)).detail("remoteId", printable(recr.remoteDcId));
int8_t loc = self->getNextLocality();
Standalone<CommitTransactionRef> tr;
tr.set(tr.arena(), tagLocalityListKeyFor(recr.remoteDcId), tagLocalityListValue(loc));
initialConfChanges->push_back(tr);
self->dcId_locality[recr.remoteDcId] = loc;
}
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers = brokenPromiseToNever( self->clusterController.recruitRemoteFromConfiguration.getReply( RecruitRemoteFromConfigurationRequest( self->configuration, recr.remoteDcId ) ) );
Reference<ILogSystem> newLogSystem = wait( oldLogSystem->newEpoch( recr, fRemoteWorkers, self->configuration, self->cstate.myDBState.recoveryCount + 1, self->dcId_locality[primaryDcId], self->dcId_locality[recr.remoteDcId] ) );
self->logSystem = newLogSystem;
} else {
Reference<ILogSystem> newLogSystem = wait( oldLogSystem->newEpoch( recr, Never(), self->configuration, self->cstate.myDBState.recoveryCount + 1, tagLocalitySpecial, tagLocalitySpecial ) );
self->logSystem = newLogSystem;
} }
if( self->configuration.remoteTLogReplicationFactor > 0 && !self->dcId_locality.count(recr.remoteDcId) ) {
TraceEvent("UnknownRemoteDCID", self->dbgid).detail("remoteFound", self->dcId_locality.count(recr.remoteDcId)).detail("remoteId", printable(recr.remoteDcId));
int8_t loc = self->getNextLocality();
Standalone<CommitTransactionRef> tr;
tr.set(tr.arena(), tagLocalityListKeyFor(recr.remoteDcId), tagLocalityListValue(loc));
initialConfChanges->push_back(tr);
self->dcId_locality[recr.remoteDcId] = loc;
}
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers = self->configuration.remoteTLogReplicationFactor > 0 ? brokenPromiseToNever( self->clusterController.recruitRemoteFromConfiguration.getReply( RecruitRemoteFromConfigurationRequest( self->configuration, recr.remoteDcId ) ) ) : Never();
Reference<ILogSystem> newLogSystem = wait( oldLogSystem->newEpoch( recr, fRemoteWorkers, self->configuration, self->cstate.myDBState.recoveryCount + 1, self->dcId_locality[primaryDcId], self->dcId_locality[recr.remoteDcId] ) );
self->logSystem = newLogSystem;
return Void(); return Void();
} }
@ -547,9 +551,11 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
RecruitFromConfigurationRequest( self->configuration, self->lastEpochEnd==0 ) ) ) ); RecruitFromConfigurationRequest( self->configuration, self->lastEpochEnd==0 ) ) ) );
self->primaryDcId.clear(); self->primaryDcId.clear();
self->primaryDcId.push_back(recruits.remoteDcId == self->configuration.remoteDcId ? self->configuration.primaryDcId : self->configuration.remoteDcId);
self->remoteDcId.clear(); self->remoteDcId.clear();
self->remoteDcId.push_back(recruits.remoteDcId); if(recruits.remoteDcId.present()) {
self->primaryDcId.push_back(recruits.remoteDcId == self->configuration.remoteDcId ? self->configuration.primaryDcId : self->configuration.remoteDcId);
self->remoteDcId.push_back(recruits.remoteDcId);
}
TraceEvent("MasterRecoveryState", self->dbgid) TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::initializing_transaction_servers) .detail("StatusCode", RecoveryStatus::initializing_transaction_servers)

View File

@ -1,3 +1,4 @@
testTitle=RandomReadWriteTest testTitle=RandomReadWriteTest
testName=ConflictRange testName=ConflictRange
connectionFailuresDisableDuration=100000 connectionFailuresDisableDuration=100000
buggify=off

View File

@ -1,4 +1,5 @@
testTitle=RandomReadWriteTest testTitle=RandomReadWriteTest
testName=ConflictRange testName=ConflictRange
testReadYourWrites=true testReadYourWrites=true
connectionFailuresDisableDuration=100000 connectionFailuresDisableDuration=100000
buggify=off