grpclb: should not ignore subchannels with CONNECTING state in aggregating the overall LB state (#7959)

We should treat both IDLE and CONNECTING subchannels as "connection in progress" when aggregating for the overall load balancing state. Otherwise, RPCs could fail prematurely if one subchannel enters TF while all the others are still in CONNECTING.

23d279660c made each individual subchannel stay in TF until READY if it previously was in TF. So subchannels with CONNECTING state are those in first time connecting. We should give them time to connect.
This commit is contained in:
Chengyuan Zhang 2021-03-11 16:48:01 -08:00 committed by GitHub
parent afe883119d
commit 9c562c8a6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 6 deletions

View File

@ -776,7 +776,7 @@ final class GrpclbState {
case ROUND_ROBIN:
pickList = new ArrayList<>(backendList.size());
Status error = null;
boolean hasIdle = false;
boolean hasPending = false;
for (BackendEntry entry : backendList) {
Subchannel subchannel = entry.subchannel;
Attributes attrs = subchannel.getAttributes();
@ -785,12 +785,12 @@ final class GrpclbState {
pickList.add(entry);
} else if (stateInfo.getState() == TRANSIENT_FAILURE) {
error = stateInfo.getStatus();
} else if (stateInfo.getState() == IDLE) {
hasIdle = true;
} else {
hasPending = true;
}
}
if (pickList.isEmpty()) {
if (error != null && !hasIdle) {
if (error != null && !hasPending) {
pickList.add(new ErrorEntry(error));
state = TRANSIENT_FAILURE;
} else {

View File

@ -1175,9 +1175,10 @@ public class GrpclbLoadBalancerTest {
deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class));
// Switch subchannel1 to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too.
Status error = Status.UNAVAILABLE.withDescription("error1");
// Switch all subchannels to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too.
Status error = Status.UNAVAILABLE.withDescription("error");
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error));
deliverSubchannelState(subchannel2, ConnectivityStateInfo.forTransientFailure(error));
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new ErrorEntry(error));