grpclb: keep RR Subchannel state in TRANSIENT_FAILURE until becoming READY (#7816)

If all RR servers are unhealthy, it's possible that at least one
connection is CONNECTING at every moment which causes RR to stay in
CONNECTING. It's better to keep the TRANSIENT_FAILURE state in that
case so that fail-fast RPCs can fail fast.

The same changes have been made for RoundRobinLoadBalancer in #6657
This commit is contained in:
Kun Zhang 2021-01-15 15:19:52 -08:00 committed by GitHub
parent 9016cf55d7
commit 23d279660c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 57 additions and 3 deletions

View File

@ -203,9 +203,20 @@ final class GrpclbState {
if (config.getMode() == Mode.ROUND_ROBIN && newState.getState() == IDLE) {
subchannel.requestConnection();
}
subchannel.getAttributes().get(STATE_INFO).set(newState);
maybeUseFallbackBackends();
maybeUpdatePicker();
AtomicReference<ConnectivityStateInfo> stateInfoRef =
subchannel.getAttributes().get(STATE_INFO);
// If all RR servers are unhealthy, it's possible that at least one connection is CONNECTING at
// every moment which causes RR to stay in CONNECTING. It's better to keep the TRANSIENT_FAILURE
// state in that case so that fail-fast RPCs can fail fast.
boolean keepState =
config.getMode() == Mode.ROUND_ROBIN
&& stateInfoRef.get().getState() == TRANSIENT_FAILURE
&& (newState.getState() == CONNECTING || newState.getState() == IDLE);
if (!keepState) {
stateInfoRef.set(newState);
maybeUseFallbackBackends();
maybeUpdatePicker();
}
}
/**

View File

@ -1104,6 +1104,49 @@ public class GrpclbLoadBalancerTest {
verify(subchannelPool).clear();
}
@Test
public void roundRobinMode_subchannelStayTransientFailureUntilReady() {
InOrder inOrder = inOrder(helper);
List<EquivalentAddressGroup> grpclbBalancerList = createResolvedBalancerAddresses(1);
deliverResolvedAddresses(Collections.<EquivalentAddressGroup>emptyList(), grpclbBalancerList);
verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture());
StreamObserver<LoadBalanceResponse> lbResponseObserver = lbResponseObserverCaptor.getValue();
// Simulate receiving LB response
List<ServerEntry> backends1 = Arrays.asList(
new ServerEntry("127.0.0.1", 2000, "token0001"),
new ServerEntry("127.0.0.1", 2010, "token0002"));
lbResponseObserver.onNext(buildInitialResponse());
lbResponseObserver.onNext(buildLbResponse(backends1));
assertEquals(2, mockSubchannels.size());
Subchannel subchannel1 = mockSubchannels.poll();
Subchannel subchannel2 = mockSubchannels.poll();
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING));
deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class));
// Switch subchannel1 to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too.
Status error = Status.UNAVAILABLE.withDescription("error1");
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error));
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new ErrorEntry(error));
// Switch subchannel1 to IDLE, then to CONNECTING, which are ignored since the previous
// subchannel state is TRANSIENT_FAILURE. General state is unchanged.
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(IDLE));
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verifyNoMoreInteractions();
// Switch subchannel1 to READY, which will affect the general state
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(READY));
inOrder.verify(helper).updateBalancingState(eq(READY), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new BackendEntry(subchannel1, getLoadRecorder(), "token0001"));
inOrder.verifyNoMoreInteractions();
}
@Test
public void grpclbFallback_initialTimeout_serverListReceivedBeforeTimerExpires() {
subtestGrpclbFallbackInitialTimeout(false);