From 74d831f72f7f41c38782bf8e3d609b0bb8a8ae10 Mon Sep 17 00:00:00 2001 From: yifeizhuang Date: Wed, 20 Apr 2022 11:21:40 -0700 Subject: [PATCH] xds: priority reset failover timer when connecting if seen ready or idle since TF (#9093) (#9098) changes in priority: Keep track of whether a child has seen TRANSIENT_FAILURE more recently than IDLE or READY, and use this to decide whether to restart the failover timer when a child reports CONNECTING. This ensures that we properly start the failover timer when the ring_hash child policy transitions from IDLE to CONNECTING at startup. Behaviour change also affects address updates the current priority from CONNECTING to CONNECTING, previously it reports one CONNECTING, right now it does not report and wait there due to failover timer in effect. This helps to try the next priority. --- .../io/grpc/xds/PriorityLoadBalancer.java | 46 +++++++++++-------- .../io/grpc/xds/PriorityLoadBalancerTest.java | 38 +++++++++++++++ 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java b/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java index 0678ffc34b6..63855a28dd4 100644 --- a/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java +++ b/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java @@ -188,7 +188,8 @@ private final class ChildLbState { final GracefulSwitchLoadBalancer lb; // Timer to fail over to the next priority if not connected in 10 sec. Scheduled only once at // child initialization. - final ScheduledHandle failOverTimer; + ScheduledHandle failOverTimer; + boolean seenReadyOrIdleSinceTransientFailure = false; // Timer to delay shutdown and deletion of the priority. Scheduled whenever the child is // deactivated. @Nullable ScheduledHandle deletionTimer; @@ -200,24 +201,23 @@ private final class ChildLbState { this.priority = priority; childHelper = new ChildHelper(ignoreReresolution); lb = new GracefulSwitchLoadBalancer(childHelper); + failOverTimer = syncContext.schedule(new FailOverTask(), 10, TimeUnit.SECONDS, executor); + logger.log(XdsLogLevel.DEBUG, "Priority created: {0}", priority); + } - class FailOverTask implements Runnable { - @Override - public void run() { - if (deletionTimer != null && deletionTimer.isPending()) { - // The child is deactivated. - return; - } - picker = new ErrorPicker( - Status.UNAVAILABLE.withDescription("Connection timeout for priority " + priority)); - logger.log(XdsLogLevel.DEBUG, "Priority {0} failed over to next", priority); - currentPriority = null; // reset currentPriority to guarantee failover happen - tryNextPriority(true); + final class FailOverTask implements Runnable { + @Override + public void run() { + if (deletionTimer != null && deletionTimer.isPending()) { + // The child is deactivated. + return; } + picker = new ErrorPicker( + Status.UNAVAILABLE.withDescription("Connection timeout for priority " + priority)); + logger.log(XdsLogLevel.DEBUG, "Priority {0} failed over to next", priority); + currentPriority = null; // reset currentPriority to guarantee failover happen + tryNextPriority(true); } - - failOverTimer = syncContext.schedule(new FailOverTask(), 10, TimeUnit.SECONDS, executor); - logger.log(XdsLogLevel.DEBUG, "Priority created: {0}", priority); } /** @@ -312,11 +312,17 @@ public void run() { if (deletionTimer != null && deletionTimer.isPending()) { return; } - if (failOverTimer.isPending()) { - if (newState.equals(READY) || newState.equals(IDLE) - || newState.equals(TRANSIENT_FAILURE)) { - failOverTimer.cancel(); + if (newState.equals(CONNECTING) ) { + if (!failOverTimer.isPending() && seenReadyOrIdleSinceTransientFailure) { + failOverTimer = syncContext.schedule(new FailOverTask(), 10, TimeUnit.SECONDS, + executor); } + } else if (newState.equals(READY) || newState.equals(IDLE)) { + seenReadyOrIdleSinceTransientFailure = true; + failOverTimer.cancel(); + } else if (newState.equals(TRANSIENT_FAILURE)) { + seenReadyOrIdleSinceTransientFailure = false; + failOverTimer.cancel(); } tryNextPriority(true); } diff --git a/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java b/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java index 9a23770512c..47888a7980e 100644 --- a/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java +++ b/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java @@ -435,6 +435,44 @@ public void idleToConnectingDoesNotTriggerFailOver() { assertThat(fooHelpers).hasSize(1); } + @Test + public void connectingResetFailOverIfSeenReadyOrIdleSinceTransientFailure() { + PriorityChildConfig priorityChildConfig0 = + new PriorityChildConfig(new PolicySelection(fooLbProvider, new Object()), true); + PriorityChildConfig priorityChildConfig1 = + new PriorityChildConfig(new PolicySelection(fooLbProvider, new Object()), true); + PriorityLbConfig priorityLbConfig = + new PriorityLbConfig( + ImmutableMap.of("p0", priorityChildConfig0, "p1", priorityChildConfig1), + ImmutableList.of("p0", "p1")); + priorityLb.handleResolvedAddresses( + ResolvedAddresses.newBuilder() + .setAddresses(ImmutableList.of()) + .setLoadBalancingPolicyConfig(priorityLbConfig) + .build()); + assertThat(fooBalancers).hasSize(1); + assertThat(fooHelpers).hasSize(1); + Helper helper0 = Iterables.getOnlyElement(fooHelpers); + + // p0 gets IDLE. + helper0.updateBalancingState( + IDLE, + BUFFER_PICKER); + assertCurrentPickerIsBufferPicker(); + + // p0 goes to CONNECTING, reset failover timer + fakeClock.forwardTime(5, TimeUnit.SECONDS); + helper0.updateBalancingState( + CONNECTING, + BUFFER_PICKER); + verify(helper, times(2)).updateBalancingState(eq(CONNECTING), eq(BUFFER_PICKER)); + + // failover happens + fakeClock.forwardTime(10, TimeUnit.SECONDS); + assertThat(fooBalancers).hasSize(2); + assertThat(fooHelpers).hasSize(2); + } + @Test public void readyToConnectDoesNotFailOverButUpdatesPicker() { PriorityChildConfig priorityChildConfig0 =