Merge pull request #1649 from etschannen/feature-coordinator-bug
The coordinators did not always converge on the same leader
commit bb22ee7d37

@@ -2,6 +2,14 @@
 Release Notes
 #############
 
+6.1.9
+=====
+
+Fixes
+-----
+
+* Sometimes a minority of coordinators would not converge to the leader. `(PR #1649) <https://github.com/apple/foundationdb/pull/1649>`_
+
 6.1.8
 =====
 

@@ -392,6 +392,7 @@ struct Peer : NonCopyable {
 		state ReplyPromise<Void> reply;
 		FlowTransport::transport().sendUnreliable( SerializeSource<ReplyPromise<Void>>(reply), remotePing.getEndpoint() );
 		state int64_t startingBytes = peer->bytesReceived;
+		state int timeouts = 0;
 		loop {
 			choose {
 				when (wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) )) {

@@ -399,7 +400,11 @@ struct Peer : NonCopyable {
 						TraceEvent("ConnectionTimeout").suppressFor(1.0).detail("WithAddr", peer->destination);
 						throw connection_failed();
 					}
+					if(timeouts > 1) {
+						TraceEvent(SevWarnAlways, "ConnectionSlowPing").suppressFor(1.0).detail("WithAddr", peer->destination).detail("Timeouts", timeouts);
+					}
 					startingBytes = peer->bytesReceived;
+					timeouts++;
 				}
 				when (wait( reply.getFuture() )) {
 					break;

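Taken together, the two hunks above teach the connection monitor to distinguish a dead connection from a merely slow one: the monitor snapshots peer->bytesReceived when each ping is sent, and (per the guard just above this hunk, not shown) throws connection_failed only when a whole CONNECTION_MONITOR_TIMEOUT window passes with no new bytes; otherwise it restarts the window, and once more than one window has elapsed for the same ping it emits a ConnectionSlowPing warning. A minimal standalone C++ sketch of that control flow, assuming plain functions in place of Flow's choose/when and TraceEvent (PingState and onTimeout are illustrative names, not from the codebase):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the per-connection state the monitor keeps.
struct PingState {
	int64_t bytesReceived = 0; // total bytes ever received on the connection
	int64_t startingBytes = 0; // snapshot taken when the current ping was sent
	int     timeouts      = 0; // timeout windows elapsed waiting for this ping
};

// Models one expiry of the CONNECTION_MONITOR_TIMEOUT timer while the ping
// reply is still outstanding. Returns false if the connection should fail.
bool onTimeout(PingState& s) {
	if (s.startingBytes == s.bytesReceived) {
		// No data at all during the window: treat the peer as unreachable.
		std::printf("ConnectionTimeout\n");
		return false;
	}
	if (s.timeouts > 1) {
		// Data is still flowing but the ping reply is very late: warn only.
		std::printf("ConnectionSlowPing Timeouts=%d\n", s.timeouts);
	}
	s.startingBytes = s.bytesReceived; // restart the window
	s.timeouts++;
	return true;
}

int main() {
	PingState s;
	s.bytesReceived = 100; // window 1: traffic arrived, window restarts
	onTimeout(s);
	s.bytesReceived = 250; // window 2: still receiving
	onTimeout(s);
	s.bytesReceived = 400; // window 3: timeouts > 1, SlowPing is logged
	onTimeout(s);
	// window 4: nothing new arrived, so the connection is declared failed
	if (!onTimeout(s))
		std::printf("connection_failed\n");
	return 0;
}

Counting windows instead of failing on the first slow ping keeps a saturated but healthy connection alive while still surfacing the slowness in the trace logs.
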
@@ -271,38 +271,22 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
 			return Void();
 		} else {
 			Optional<LeaderInfo> nextNominee;
-			if (availableLeaders.size() && availableCandidates.size()) {
-				nextNominee = ( *availableLeaders.begin() < *availableCandidates.begin() ) ? *availableLeaders.begin() : *availableCandidates.begin();
-			} else if (availableLeaders.size()) {
-				nextNominee = *availableLeaders.begin();
-			} else if (availableCandidates.size()) {
+			if( availableCandidates.size() && (!availableLeaders.size() || availableLeaders.begin()->leaderChangeRequired(*availableCandidates.begin())) ) {
 				nextNominee = *availableCandidates.begin();
-			} else {
-				nextNominee = Optional<LeaderInfo>();
+			} else if( availableLeaders.size() ) {
+				nextNominee = *availableLeaders.begin();
 			}
 
-			bool foundCurrentNominee = false;
-			if(currentNominee.present()) {
-				for(auto& it : availableLeaders) {
-					if(currentNominee.get().equalInternalId(it)) {
-						foundCurrentNominee = true;
-						break;
-					}
-				}
-			}
-
-			if ( !nextNominee.present() || !foundCurrentNominee || currentNominee.get().leaderChangeRequired(nextNominee.get()) ) {
-				TraceEvent("NominatingLeader").detail("Nominee", nextNominee.present() ? nextNominee.get().changeID : UID())
-					.detail("Changed", nextNominee != currentNominee).detail("Key", printable(key));
+			if( !currentNominee.present() || !nextNominee.present() || !currentNominee.get().equalInternalId(nextNominee.get()) || nextNominee.get() > currentNominee.get() ) {
+				TraceEvent("NominatingLeader").detail("NextNominee", nextNominee.present() ? nextNominee.get().changeID : UID())
+					.detail("CurrentNominee", currentNominee.present() ? currentNominee.get().changeID : UID()).detail("Key", printable(key));
 				for(unsigned int i=0; i<notify.size(); i++)
 					notify[i].send( nextNominee );
 				notify.clear();
-				currentNominee = nextNominee;
-			} else if (currentNominee.get().equalInternalId(nextNominee.get())) {
-				// leader becomes better
-				currentNominee = nextNominee;
 			}
 
+			currentNominee = nextNominee;
+
 			if( availableLeaders.size() ) {
 				nextInterval = delay( SERVER_KNOBS->POLLING_FREQUENCY );
 				if(leaderIntervalCount++ > 5) {

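The rewritten selection logic above is the substance of the fix. Every coordinator now derives nextNominee from one deterministic rule: take the best candidate only when there is no available leader, or when the best available leader itself reports that a change is required; otherwise take the best available leader. It then adopts nextNominee as currentNominee unconditionally and notifies waiters only when the nominee's identity or ordering actually changed. In the old code, currentNominee was updated only on some branches, so a register could be left holding a stale nominee and a minority of coordinators could keep advertising a different leader. A simplified standalone C++ sketch of the new rule, assuming a pared-down LeaderInfo (the real struct's ordering and leaderChangeRequired policy are more involved than the illustrative versions here):

#include <cstdio>
#include <optional>
#include <set>

// Pared-down stand-in for FoundationDB's LeaderInfo: a lower `priority`
// value compares as "better", and `id` identifies the process.
struct LeaderInfo {
	int id;
	int priority;
	bool operator<(const LeaderInfo& r) const { return priority < r.priority; }
	bool operator>(const LeaderInfo& r) const { return r < *this; }
	bool equalInternalId(const LeaderInfo& r) const { return id == r.id; }
	// Illustrative policy: a change is required when the candidate is
	// strictly better than the incumbent.
	bool leaderChangeRequired(const LeaderInfo& candidate) const {
		return candidate < *this;
	}
};

// One evaluation of the nomination rule from the patched leaderRegister.
// Returns true when waiters should be told about the (possibly empty) nominee.
bool nominate(const std::set<LeaderInfo>& availableLeaders,
              const std::set<LeaderInfo>& availableCandidates,
              std::optional<LeaderInfo>& currentNominee) {
	std::optional<LeaderInfo> nextNominee;
	if (!availableCandidates.empty() &&
	    (availableLeaders.empty() ||
	     availableLeaders.begin()->leaderChangeRequired(*availableCandidates.begin()))) {
		nextNominee = *availableCandidates.begin();
	} else if (!availableLeaders.empty()) {
		nextNominee = *availableLeaders.begin();
	}

	bool notifyWaiters =
	    !currentNominee.has_value() || !nextNominee.has_value() ||
	    !currentNominee->equalInternalId(*nextNominee) ||
	    *nextNominee > *currentNominee;

	currentNominee = nextNominee; // adopted unconditionally: every register
	                              // seeing the same inputs converges
	return notifyWaiters;
}

int main() {
	std::set<LeaderInfo> leaders    = { {7, 1} };
	std::set<LeaderInfo> candidates = { {9, 0} }; // better (lower) priority
	std::optional<LeaderInfo> nominee;
	bool notify = nominate(leaders, candidates, nominee);
	std::printf("nominee=%d notify=%d\n", nominee ? nominee->id : -1, (int)notify);
}

Because the assignment to currentNominee no longer depends on which branch was taken, two coordinators that observe the same sets of leaders and candidates can no longer disagree about the nominee, which matches the convergence problem described in the commit message.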