Skip to content

Commit

Permalink
xds: priority reset failover timer when connecting if seen ready or i…
Browse files Browse the repository at this point in the history
…dle since TF (#9093) (#9098)

changes in priority:
Keep track of whether a child has seen TRANSIENT_FAILURE more recently than IDLE or READY, and use this to decide whether to restart the failover timer when a child reports CONNECTING. This ensures that we properly start the failover timer when the ring_hash child policy transitions from IDLE to CONNECTING at startup.
Behaviour change also affects address updates the current priority from CONNECTING to CONNECTING, previously it reports one CONNECTING, right now it does not report and wait there due to failover timer in effect. This helps to try the next priority.
  • Loading branch information
YifeiZhuang authored Apr 20, 2022
1 parent afc52a0 commit 74d831f
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 20 deletions.
46 changes: 26 additions & 20 deletions xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ private final class ChildLbState {
final GracefulSwitchLoadBalancer lb;
// Timer to fail over to the next priority if not connected in 10 sec. Scheduled only once at
// child initialization.
final ScheduledHandle failOverTimer;
ScheduledHandle failOverTimer;
boolean seenReadyOrIdleSinceTransientFailure = false;
// Timer to delay shutdown and deletion of the priority. Scheduled whenever the child is
// deactivated.
@Nullable ScheduledHandle deletionTimer;
Expand All @@ -200,24 +201,23 @@ private final class ChildLbState {
this.priority = priority;
childHelper = new ChildHelper(ignoreReresolution);
lb = new GracefulSwitchLoadBalancer(childHelper);
failOverTimer = syncContext.schedule(new FailOverTask(), 10, TimeUnit.SECONDS, executor);
logger.log(XdsLogLevel.DEBUG, "Priority created: {0}", priority);
}

class FailOverTask implements Runnable {
@Override
public void run() {
if (deletionTimer != null && deletionTimer.isPending()) {
// The child is deactivated.
return;
}
picker = new ErrorPicker(
Status.UNAVAILABLE.withDescription("Connection timeout for priority " + priority));
logger.log(XdsLogLevel.DEBUG, "Priority {0} failed over to next", priority);
currentPriority = null; // reset currentPriority to guarantee failover happen
tryNextPriority(true);
final class FailOverTask implements Runnable {
@Override
public void run() {
if (deletionTimer != null && deletionTimer.isPending()) {
// The child is deactivated.
return;
}
picker = new ErrorPicker(
Status.UNAVAILABLE.withDescription("Connection timeout for priority " + priority));
logger.log(XdsLogLevel.DEBUG, "Priority {0} failed over to next", priority);
currentPriority = null; // reset currentPriority to guarantee failover happen
tryNextPriority(true);
}

failOverTimer = syncContext.schedule(new FailOverTask(), 10, TimeUnit.SECONDS, executor);
logger.log(XdsLogLevel.DEBUG, "Priority created: {0}", priority);
}

/**
Expand Down Expand Up @@ -312,11 +312,17 @@ public void run() {
if (deletionTimer != null && deletionTimer.isPending()) {
return;
}
if (failOverTimer.isPending()) {
if (newState.equals(READY) || newState.equals(IDLE)
|| newState.equals(TRANSIENT_FAILURE)) {
failOverTimer.cancel();
if (newState.equals(CONNECTING) ) {
if (!failOverTimer.isPending() && seenReadyOrIdleSinceTransientFailure) {
failOverTimer = syncContext.schedule(new FailOverTask(), 10, TimeUnit.SECONDS,
executor);
}
} else if (newState.equals(READY) || newState.equals(IDLE)) {
seenReadyOrIdleSinceTransientFailure = true;
failOverTimer.cancel();
} else if (newState.equals(TRANSIENT_FAILURE)) {
seenReadyOrIdleSinceTransientFailure = false;
failOverTimer.cancel();
}
tryNextPriority(true);
}
Expand Down
38 changes: 38 additions & 0 deletions xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,44 @@ public void idleToConnectingDoesNotTriggerFailOver() {
assertThat(fooHelpers).hasSize(1);
}

@Test
public void connectingResetFailOverIfSeenReadyOrIdleSinceTransientFailure() {
PriorityChildConfig priorityChildConfig0 =
new PriorityChildConfig(new PolicySelection(fooLbProvider, new Object()), true);
PriorityChildConfig priorityChildConfig1 =
new PriorityChildConfig(new PolicySelection(fooLbProvider, new Object()), true);
PriorityLbConfig priorityLbConfig =
new PriorityLbConfig(
ImmutableMap.of("p0", priorityChildConfig0, "p1", priorityChildConfig1),
ImmutableList.of("p0", "p1"));
priorityLb.handleResolvedAddresses(
ResolvedAddresses.newBuilder()
.setAddresses(ImmutableList.<EquivalentAddressGroup>of())
.setLoadBalancingPolicyConfig(priorityLbConfig)
.build());
assertThat(fooBalancers).hasSize(1);
assertThat(fooHelpers).hasSize(1);
Helper helper0 = Iterables.getOnlyElement(fooHelpers);

// p0 gets IDLE.
helper0.updateBalancingState(
IDLE,
BUFFER_PICKER);
assertCurrentPickerIsBufferPicker();

// p0 goes to CONNECTING, reset failover timer
fakeClock.forwardTime(5, TimeUnit.SECONDS);
helper0.updateBalancingState(
CONNECTING,
BUFFER_PICKER);
verify(helper, times(2)).updateBalancingState(eq(CONNECTING), eq(BUFFER_PICKER));

// failover happens
fakeClock.forwardTime(10, TimeUnit.SECONDS);
assertThat(fooBalancers).hasSize(2);
assertThat(fooHelpers).hasSize(2);
}

@Test
public void readyToConnectDoesNotFailOverButUpdatesPicker() {
PriorityChildConfig priorityChildConfig0 =
Expand Down

0 comments on commit 74d831f

Please sign in to comment.