From 414bd11987ed75dd333cc959bb43a88f29df3c0e Mon Sep 17 00:00:00 2001
From: Anshuman Mishra
Date: Wed, 7 Jun 2023 14:10:49 -0700
Subject: [PATCH] Use failure_rate instead of failure count for circuit breaker

Continuation of #18359.

I ran multiple experiments to find the optimal failure threshold and failure
window interval under different remote_timeout values, for a healthy remote
cache, a semi-healthy (overloaded) remote cache, and an unhealthy remote
cache. As I described
[here](https://github.com/bazelbuild/bazel/pull/18359#issuecomment-1542589266),
even with a healthy remote cache the circuit tripped on 5-10% of builds, so we
were not getting the best results.

Issues with using the failure count:

1. When the remote cache is healthy, builds are fast, and Bazel makes a high
   number of calls to the buildfarm. As a result, even with a moderate failure
   rate, the failure count may exceed the threshold.
2. Additionally, write calls, which have a higher probability of failure
   compared to other calls, are batched immediately after an action's build
   completes. This further increases the chances of breaching the failure
   threshold within the defined window interval.
3. On the other hand, when the remote cache is unhealthy or semi-healthy,
   builds slow down significantly, and Bazel makes fewer calls to the remote
   cache.

Finding a configuration that worked well for both healthy and unhealthy remote
caches was not feasible. Therefore, I changed the approach to use the failure
rate, and easily found a configuration that works effectively in both
scenarios.
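To make the contrast concrete, here is a minimal sketch with made-up call
volumes (the numbers are illustrative, not measurements from the experiments
above) showing why a fixed count trips on a healthy, high-throughput cache
while a rate threshold does not:

```java
// Illustrative only: hypothetical call volumes for one failure window.
class CountVsRateSketch {
  public static void main(String[] args) {
    int healthyCalls = 20_000, healthyFailures = 400;  // 2% failure rate
    int unhealthyCalls = 500, unhealthyFailures = 150; // 30% failure rate

    // A count threshold of 100 trips for BOTH caches; for the healthy
    // cache that is a false positive driven purely by call volume.
    System.out.println(healthyFailures > 100);   // true
    System.out.println(unhealthyFailures > 100); // true

    // A 10% rate threshold trips only for the unhealthy cache.
    System.out.println(100.0 * healthyFailures / healthyCalls > 10);     // false
    System.out.println(100.0 * unhealthyFailures / unhealthyCalls > 10); // true
  }
}
```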
Closes #18539.

PiperOrigin-RevId: 538588379
Change-Id: I64a49eeeb32846d41d54ca3b637ded3085588528
---
 .../build/lib/remote/RemoteRetrier.java       |  2 +-
 .../devtools/build/lib/remote/Retrier.java    | 23 ++++---
 .../circuitbreaker/CircuitBreakerFactory.java | 24 +------
 .../circuitbreaker/FailureCircuitBreaker.java | 61 ++++++++++-------
 .../lib/remote/options/RemoteOptions.java     | 15 +++--
 .../build/lib/remote/RetrierTest.java         | 13 ++--
 .../FailureCircuitBreakerTest.java            | 67 ++++++++++++-------
 7 files changed, 109 insertions(+), 96 deletions(-)

diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java
index 1c09af47249324..403af13d55a7cc 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java
@@ -35,7 +35,7 @@ public class RemoteRetrier extends Retrier {

   @Nullable
-  public static Status fromException(Exception e) {
+  private static Status fromException(Exception e) {
     for (Throwable cause = e; cause != null; cause = cause.getCause()) {
       if (cause instanceof StatusRuntimeException) {
         return ((StatusRuntimeException) cause).getStatus();
diff --git a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java
index 457880268764d5..74b51987318eac 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java
@@ -100,7 +100,7 @@ enum State {
     State state();

     /** Called after an execution failed. */
-    void recordFailure(Exception e);
+    void recordFailure();

     /** Called after an execution succeeded. */
     void recordSuccess();
@@ -130,7 +130,7 @@ public State state() {
         }

         @Override
-        public void recordFailure(Exception e) {}
+        public void recordFailure() {}

         @Override
         public void recordSuccess() {}
@@ -245,12 +245,14 @@ public <T> T execute(Callable<T> call, Backoff backoff) throws Exception {
         circuitBreaker.recordSuccess();
         return r;
       } catch (Exception e) {
-        circuitBreaker.recordFailure(e);
         Throwables.throwIfInstanceOf(e, InterruptedException.class);
-        if (State.TRIAL_CALL.equals(circuitState)) {
+        if (!shouldRetry.test(e)) {
+          // A non-retriable error doesn't represent a server failure.
+          circuitBreaker.recordSuccess();
           throw e;
         }
-        if (!shouldRetry.test(e)) {
+        circuitBreaker.recordFailure();
+        if (State.TRIAL_CALL.equals(circuitState)) {
           throw e;
         }
         final long delayMillis = backoff.nextDelayMillis(e);
@@ -297,11 +299,11 @@ public <T> ListenableFuture<T> executeAsync(AsyncCallable<T> call, Backoff backoff) {

   private <T> ListenableFuture<T> onExecuteAsyncFailure(
       Exception t, AsyncCallable<T> call, Backoff backoff, State circuitState) {
-    circuitBreaker.recordFailure(t);
-    if (circuitState.equals(State.TRIAL_CALL)) {
-      return Futures.immediateFailedFuture(t);
-    }
     if (isRetriable(t)) {
+      circuitBreaker.recordFailure();
+      if (circuitState.equals(State.TRIAL_CALL)) {
+        return Futures.immediateFailedFuture(t);
+      }
       long waitMillis = backoff.nextDelayMillis(t);
       if (waitMillis >= 0) {
         try {
@@ -315,6 +317,9 @@ private <T> ListenableFuture<T> onExecuteAsyncFailure(
         return Futures.immediateFailedFuture(t);
       }
     } else {
+      // gRPC errors NOT_FOUND, OUT_OF_RANGE, ALREADY_EXISTS etc. are non-retriable and
+      // don't indicate a server issue, so treat them as successful API calls.
+      circuitBreaker.recordSuccess();
       return Futures.immediateFailedFuture(t);
     }
   }
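With this change the retrier's `shouldRetry` predicate becomes the single
place that decides which failures feed the breaker. The sketch below is a
rough illustration of how such a predicate can separate server-health
failures from benign non-retriable statuses; it is not Bazel's actual
`RemoteRetrier` logic, and the chosen status codes are assumptions:

```java
import io.grpc.Status;
import java.util.function.Predicate;

class RetriableClassificationSketch {
  // Hypothetical classifier: only transient, server-side statuses are retried
  // and therefore recorded as circuit-breaker failures; everything else is
  // surfaced to the caller but recorded as a success.
  static final Predicate<Exception> SHOULD_RETRY =
      e -> {
        switch (Status.fromThrowable(e).getCode()) {
          case UNAVAILABLE:
          case DEADLINE_EXCEEDED:
          case RESOURCE_EXHAUSTED:
            return true; // likely server overload or outage
          default:
            return false; // e.g. NOT_FOUND, ALREADY_EXISTS, OUT_OF_RANGE
        }
      };
}
```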
diff --git a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java
index 94002a94c26241..7781440880c885 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java
@@ -14,31 +14,11 @@
 package com.google.devtools.build.lib.remote.circuitbreaker;

 import com.google.devtools.build.lib.remote.Retrier;
-import com.google.devtools.build.lib.remote.common.CacheNotFoundException;
 import com.google.devtools.build.lib.remote.options.RemoteOptions;
-import io.grpc.Status;
-import java.util.function.Predicate;
-
-import static com.google.devtools.build.lib.remote.RemoteRetrier.fromException;
-
 /** Factory for {@link Retrier.CircuitBreaker} */
 public class CircuitBreakerFactory {
-  public static final Predicate<Exception> DEFAULT_IGNORED_ERRORS =
-      e -> {
-        Status s = fromException(e);
-        if (s == null) {
-          return e.getClass() == CacheNotFoundException.class;
-        }
-        switch (s.getCode()) {
-          case NOT_FOUND:
-          case OUT_OF_RANGE:
-            return true;
-          default:
-            return false;
-        }
-      };
+  public static final int DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE = 100;

   private CircuitBreakerFactory() {}

@@ -53,7 +33,7 @@ private CircuitBreakerFactory() {}
   public static Retrier.CircuitBreaker createCircuitBreaker(final RemoteOptions remoteOptions) {
     if (remoteOptions.circuitBreakerStrategy == RemoteOptions.CircuitBreakerStrategy.FAILURE) {
       return new FailureCircuitBreaker(
-          remoteOptions.remoteFailureThreshold,
+          remoteOptions.remoteFailureRateThreshold,
           (int) remoteOptions.remoteFailureWindowInterval.toMillis());
     }
     return Retrier.ALLOW_ALL_CALLS;
diff --git a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java
index 8630337ac9af7a..363d32d12007b4 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java
@@ -18,41 +18,43 @@
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.function.Predicate;

 /**
  * The {@link FailureCircuitBreaker} implementation of the {@link Retrier.CircuitBreaker} prevents
- * further calls to a remote cache once the number of failures within a given window exceeds a
- * specified threshold for a build. In the context of Bazel, a new instance of {@link
- * Retrier.CircuitBreaker} is created for each build. Therefore, if the circuit breaker trips during
- * a build, the remote cache will be disabled for that build. However, it will be enabled again for
- * the next build as a new instance of {@link Retrier.CircuitBreaker} will be created.
+ * further calls to a remote cache once the failure rate within a given window exceeds a specified
+ * threshold for a build. In the context of Bazel, a new instance of {@link Retrier.CircuitBreaker}
+ * is created for each build. Therefore, if the circuit breaker trips during a build, the remote
+ * cache will be disabled for that build. However, it will be enabled again for the next build as a
+ * new instance of {@link Retrier.CircuitBreaker} will be created.
 */
 public class FailureCircuitBreaker implements Retrier.CircuitBreaker {

   private State state;
+  private final AtomicInteger successes;
   private final AtomicInteger failures;
-  private final int failureThreshold;
+  private final int failureRateThreshold;
   private final int slidingWindowSize;
+  private final int minCallCountToComputeFailureRate;
   private final ScheduledExecutorService scheduledExecutor;
-  private final Predicate<Exception> ignoredErrors;

   /**
    * Creates a {@link FailureCircuitBreaker}.
    *
-   * @param failureThreshold is used to set the number of failures required to trip the circuit
-   *     breaker in given time window.
+   * @param failureRateThreshold the minimum percentage of failures required to trip the circuit
+   *     breaker in the given time window.
    * @param slidingWindowSize the size of the sliding window in milliseconds to calculate the number
    *     of failures.
   */
-  public FailureCircuitBreaker(int failureThreshold, int slidingWindowSize) {
-    this.failureThreshold = failureThreshold;
+  public FailureCircuitBreaker(int failureRateThreshold, int slidingWindowSize) {
     this.failures = new AtomicInteger(0);
+    this.successes = new AtomicInteger(0);
+    this.failureRateThreshold = failureRateThreshold;
     this.slidingWindowSize = slidingWindowSize;
+    this.minCallCountToComputeFailureRate =
+        CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE;
     this.state = State.ACCEPT_CALLS;
     this.scheduledExecutor =
         slidingWindowSize > 0 ? Executors.newSingleThreadScheduledExecutor() : null;
-    this.ignoredErrors = CircuitBreakerFactory.DEFAULT_IGNORED_ERRORS;
   }

   @Override
@@ -61,23 +63,30 @@ public State state() {
   }

   @Override
-  public void recordFailure(Exception e) {
-    if (!ignoredErrors.test(e)) {
-      int failureCount = failures.incrementAndGet();
-      if (slidingWindowSize > 0) {
-        var unused =
-            scheduledExecutor.schedule(
-                failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
-      }
-      // Since the state can only be changed to the open state, synchronization is not required.
-      if (failureCount > this.failureThreshold) {
-        this.state = State.REJECT_CALLS;
-      }
+  public void recordFailure() {
+    int failureCount = failures.incrementAndGet();
+    int totalCallCount = successes.get() + failureCount;
+    if (slidingWindowSize > 0) {
+      var unused =
+          scheduledExecutor.schedule(
+              failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
+    }
+
+    if (totalCallCount < minCallCountToComputeFailureRate) {
+      // The remote call count is below the threshold required to calculate the failure rate.
+      return;
+    }
+    double failureRate = (failureCount * 100.0) / totalCallCount;
+
+    // Since the state can only be changed to the open state, synchronization is not required.
+    if (failureRate > this.failureRateThreshold) {
+      this.state = State.REJECT_CALLS;
     }
   }

   @Override
   public void recordSuccess() {
-    // do nothing, implement if we need to set threshold on failure rate instead of count.
+    successes.incrementAndGet();
+    if (slidingWindowSize > 0) {
+      var unused =
+          scheduledExecutor.schedule(
+              successes::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
+    }
   }
 }
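Two behaviors of the new breaker are worth calling out: the rate is only
computed once a minimum sample of calls has accumulated, and both counters
decay out of the sliding window via scheduled decrements. A minimal usage
sketch outside Bazel (the traffic numbers are hypothetical):

```java
import com.google.devtools.build.lib.remote.circuitbreaker.FailureCircuitBreaker;

class BreakerSketch {
  public static void main(String[] args) {
    // 10% failure-rate threshold; a window size of 0 disables the expiry
    // scheduler, which keeps this sketch a plain run-to-completion program.
    FailureCircuitBreaker breaker = new FailureCircuitBreaker(10, 0);

    // One failure out of five calls is a 20% "rate", but the breaker stays
    // closed: five calls is far below the 100-call minimum sample size.
    breaker.recordFailure();
    for (int i = 0; i < 4; i++) {
      breaker.recordSuccess();
    }
    System.out.println(breaker.state()); // ACCEPT_CALLS
  }
}
```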
diff --git a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java
index da725faec3e72f..f8a18361bb658e 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java
@@ -666,15 +666,16 @@ public RemoteOutputsStrategyConverter() {
   public CircuitBreakerStrategy circuitBreakerStrategy;

   @Option(
-      name = "experimental_remote_failure_threshold",
-      defaultValue = "100",
+      name = "experimental_remote_failure_rate_threshold",
+      defaultValue = "10",
       documentationCategory = OptionDocumentationCategory.REMOTE,
       effectTags = {OptionEffectTag.EXECUTION},
+      converter = Converters.PercentageConverter.class,
       help =
-          "Sets the allowed number of failures in a specific time window after which it stops"
-              + " calling to the remote cache/executor. By default the value is 100. Setting this"
-              + " to 0 or negative means no limitation.")
-  public int remoteFailureThreshold;
+          "Sets the allowed failure rate in percentage for a specific time window after which it"
+              + " stops calling to the remote cache/executor. By default the value is 10."
+              + " Setting this to 0 means no limitation.")
+  public int remoteFailureRateThreshold;

   @Option(
       name = "experimental_remote_failure_window_interval",
@@ -683,7 +684,7 @@ public RemoteOutputsStrategyConverter() {
       effectTags = {OptionEffectTag.EXECUTION},
       converter = RemoteDurationConverter.class,
       help =
-          "The interval in which the failure count of the remote requests are computed. On zero or"
+          "The interval in which the failure rate of the remote requests is computed. On zero or"
              + " negative value the failure rate is computed over the whole duration of the"
              + " execution. Following units can be used: Days (d), hours (h), minutes (m), seconds"
              + " (s), and milliseconds (ms). If the unit is omitted, the value is interpreted as"
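With these options, a build can opt into a looser breaker, e.g.
`--experimental_remote_failure_rate_threshold=20
--experimental_remote_failure_window_interval=60s`, which tolerates up to a
20% failure rate per 60-second window. These values are illustrative, not
recommended defaults.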
diff --git a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java
index 7c30e1bf6eddc3..edcae774e0436a 100644
--- a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java
+++ b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java
@@ -38,6 +38,8 @@
 import java.util.function.Predicate;
 import java.util.function.Supplier;
 import javax.annotation.concurrent.ThreadSafe;
+import io.grpc.Status;
+import io.grpc.StatusRuntimeException;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -94,7 +97,7 @@ public void retryShouldWork_failure() throws Exception {
     assertThat(e).hasMessageThat().isEqualTo("call failed");
     assertThat(numCalls.get()).isEqualTo(3);

-    verify(alwaysOpen, times(3)).recordFailure(any(Exception.class));
+    verify(alwaysOpen, times(3)).recordFailure();
     verify(alwaysOpen, never()).recordSuccess();
   }

@@ -118,8 +121,8 @@ public void retryShouldWorkNoRetries_failure() throws Exception {
     assertThat(e).hasMessageThat().isEqualTo("call failed");
     assertThat(numCalls.get()).isEqualTo(1);

-    verify(alwaysOpen, times(1)).recordFailure(e);
-    verify(alwaysOpen, never()).recordSuccess();
+    verify(alwaysOpen, never()).recordFailure();
+    verify(alwaysOpen, times(1)).recordSuccess();
   }

   @Test
@@ -139,7 +142,7 @@ public void retryShouldWork_success() throws Exception {
         });
     assertThat(val).isEqualTo(1);

-    verify(alwaysOpen, times(2)).recordFailure(any(Exception.class));
+    verify(alwaysOpen, times(2)).recordFailure();
     verify(alwaysOpen, times(1)).recordSuccess();
   }

@@ -351,7 +354,7 @@ public synchronized State state() {
     }

     @Override
-    public synchronized void recordFailure(Exception e) {
+    public synchronized void recordFailure() {
       consecutiveFailures++;
       if (consecutiveFailures >= maxConsecutiveFailures) {
         state = State.REJECT_CALLS;
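The arithmetic that the circuit-trip test below relies on, spelled out
(threshold = 10%, strict inequality):

```java
class TripMathSketch {
  public static void main(String[] args) {
    int failures = 10, successes = 90;
    // 10 failures out of 100 calls is exactly 10%, which is NOT strictly
    // greater than the 10% threshold, so the breaker stays closed.
    System.out.println(100.0 * failures / (failures + successes)); // 10.0

    // One more failure while the previous calls are still inside the
    // sliding window: 11 / 101 is roughly 10.89%, which exceeds 10% and
    // trips the breaker.
    failures++;
    System.out.println(100.0 * failures / (failures + successes)); // ~10.89
  }
}
```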
diff --git a/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java b/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java
index c4ea47121959ee..a065bc14603c1f 100644
--- a/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java
+++ b/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java
@@ -15,17 +15,12 @@

 import static com.google.common.truth.Truth.assertThat;

-import build.bazel.remote.execution.v2.Digest;
 import com.google.devtools.build.lib.remote.Retrier.CircuitBreaker.State;
-import com.google.devtools.build.lib.remote.common.CacheNotFoundException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
-import java.util.Random;
-import io.grpc.Status;
-import io.grpc.StatusRuntimeException;
+import java.util.stream.IntStream;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -33,47 +28,67 @@
 @RunWith(JUnit4.class)
 public class FailureCircuitBreakerTest {

-  private static final List<Exception> IGNORED_ERRORS =
-      Arrays.asList(
-          new CacheNotFoundException(Digest.newBuilder().build()),
-          new StatusRuntimeException(Status.OUT_OF_RANGE));
-
   @Test
-  public void testRecordFailure() throws InterruptedException {
-    final int failureThreshold = 10;
+  public void testRecordFailure_circuitTrips() throws InterruptedException {
+    final int failureRateThreshold = 10;
     final int windowInterval = 100;
     FailureCircuitBreaker failureCircuitBreaker =
-        new FailureCircuitBreaker(failureThreshold, windowInterval);
+        new FailureCircuitBreaker(failureRateThreshold, windowInterval);

-    List<Exception> listOfExceptionThrownOnFailure = new ArrayList<>();
-    for (int index = 0; index < failureThreshold; index++) {
-      listOfExceptionThrownOnFailure.add(new Exception());
+    List<Runnable> listOfSuccessAndFailureCalls = new ArrayList<>();
+    for (int index = 0; index < failureRateThreshold; index++) {
+      listOfSuccessAndFailureCalls.add(failureCircuitBreaker::recordFailure);
     }
-    Random rand = new Random();
-    for (int index = 0; index < failureThreshold * 9; index++) {
-      listOfExceptionThrownOnFailure.add(IGNORED_ERRORS.get(rand.nextInt(IGNORED_ERRORS.size())));
+
+    for (int index = 0; index < failureRateThreshold * 9; index++) {
+      listOfSuccessAndFailureCalls.add(failureCircuitBreaker::recordSuccess);
     }

-    Collections.shuffle(listOfExceptionThrownOnFailure);
+    Collections.shuffle(listOfSuccessAndFailureCalls);

-    // make calls equals to threshold number of not ignored failure calls in parallel.
-    listOfExceptionThrownOnFailure.stream()
-        .parallel()
-        .forEach(failureCircuitBreaker::recordFailure);
+    // Run the shuffled success and failure calls in parallel.
+    listOfSuccessAndFailureCalls.stream().parallel().forEach(Runnable::run);
     assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS);

     // Sleep for windowInterval + 1ms.
     Thread.sleep(windowInterval + 1 /* to compensate for any delay */);

-    // make calls equals to threshold number of not ignored failure calls in parallel.
-    listOfExceptionThrownOnFailure.stream()
-        .parallel()
-        .forEach(failureCircuitBreaker::recordFailure);
+    // Run the same mix of calls again in parallel; the previous window has expired.
+    listOfSuccessAndFailureCalls.stream().parallel().forEach(Runnable::run);
     assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS);

     // Sleep for less than windowInterval.
     Thread.sleep(windowInterval - 5);
-    failureCircuitBreaker.recordFailure(new Exception());
+    failureCircuitBreaker.recordFailure();
+    assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS);
+  }
+
+  @Test
+  public void testRecordFailure_minCallCriteriaNotMet() throws InterruptedException {
+    final int failureRateThreshold = 10;
+    final int windowInterval = 100;
+    final int minCallToComputeFailure =
+        CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE;
+    FailureCircuitBreaker failureCircuitBreaker =
+        new FailureCircuitBreaker(failureRateThreshold, windowInterval);
+
+    // Record half failure calls and half success calls; the failure calls run while
+    // the total call count is still below minCallToComputeFailure, so no rate is computed.
+    IntStream.range(0, minCallToComputeFailure >> 1)
+        .parallel()
+        .forEach(i -> failureCircuitBreaker.recordFailure());
+    IntStream.range(0, minCallToComputeFailure >> 1)
+        .parallel()
+        .forEach(i -> failureCircuitBreaker.recordSuccess());
+    assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS);
+
+    // Sleep for less than windowInterval.
+    Thread.sleep(windowInterval - 20);
+    failureCircuitBreaker.recordFailure();
     assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS);
   }
 }