diff --git a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java index be8835b7c0b2d3..6ab6b4258d2cbd 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java +++ b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java @@ -23,6 +23,7 @@ public class CircuitBreakerFactory { public static final ImmutableSet> DEFAULT_IGNORED_ERRORS = ImmutableSet.of(CacheNotFoundException.class); + public static final int DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE = 100; private CircuitBreakerFactory() {} @@ -37,7 +38,7 @@ private CircuitBreakerFactory() {} public static Retrier.CircuitBreaker createCircuitBreaker(final RemoteOptions remoteOptions) { if (remoteOptions.circuitBreakerStrategy == RemoteOptions.CircuitBreakerStrategy.FAILURE) { return new FailureCircuitBreaker( - remoteOptions.remoteFailureThreshold, + remoteOptions.remoteFailureRateThreshold, (int) remoteOptions.remoteFailureWindowInterval.toMillis()); } return Retrier.ALLOW_ALL_CALLS; diff --git a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java index b1b5739fd44c96..2baeba4ed07f3d 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java +++ b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java @@ -21,34 +21,38 @@ import java.util.concurrent.atomic.AtomicInteger; /** - * The {@link FailureCircuitBreaker} implementation of the {@link Retrier.CircuitBreaker} prevents - * further calls to a remote cache once the number of failures within a given window exceeds a - * specified threshold for a build. In the context of Bazel, a new instance of {@link - * Retrier.CircuitBreaker} is created for each build. Therefore, if the circuit breaker trips during - * a build, the remote cache will be disabled for that build. However, it will be enabled again for - * the next build as a new instance of {@link Retrier.CircuitBreaker} will be created. + * The {@link FailureCircuitBreaker} implementation of the {@link Retrier.CircuitBreaker} prevents further calls to + * a remote cache once the failures rate within a given window exceeds a specified threshold for a build. + * In the context of Bazel, a new instance of {@link Retrier.CircuitBreaker} is created for each build. Therefore, if + * the circuit breaker trips during a build, the remote cache will be disabled for that build. However, it will be + * enabled again for the next build as a new instance of {@link Retrier.CircuitBreaker} will be created. */ public class FailureCircuitBreaker implements Retrier.CircuitBreaker { private State state; + private final AtomicInteger successes; private final AtomicInteger failures; - private final int failureThreshold; + private final AtomicInteger ignoredFailures; + private final int failureRateThreshold; private final int slidingWindowSize; + private final int minCallCountToComputeFailureRate; private final ScheduledExecutorService scheduledExecutor; private final ImmutableSet> ignoredErrors; /** * Creates a {@link FailureCircuitBreaker}. * - * @param failureThreshold is used to set the number of failures required to trip the circuit - * breaker in given time window. - * @param slidingWindowSize the size of the sliding window in milliseconds to calculate the number - * of failures. + * @param failureRateThreshold is used to set the min percentage of failure required to trip the circuit breaker in + * given time window. + * @param slidingWindowSize the size of the sliding window in milliseconds to calculate the number of failures. */ - public FailureCircuitBreaker(int failureThreshold, int slidingWindowSize) { - this.failureThreshold = failureThreshold; + public FailureCircuitBreaker(int failureRateThreshold, int slidingWindowSize) { this.failures = new AtomicInteger(0); + this.successes = new AtomicInteger(0); + this.ignoredFailures = new AtomicInteger(0); + this.failureRateThreshold = failureRateThreshold; this.slidingWindowSize = slidingWindowSize; + this.minCallCountToComputeFailureRate = CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE; this.state = State.ACCEPT_CALLS; this.scheduledExecutor = slidingWindowSize > 0 ? Executors.newSingleThreadScheduledExecutor() : null; @@ -64,20 +68,36 @@ public State state() { public void recordFailure(Exception e) { if (!ignoredErrors.contains(e.getClass())) { int failureCount = failures.incrementAndGet(); + int totalCallCount = successes.get() + failureCount + ignoredFailures.get(); if (slidingWindowSize > 0) { var unused = scheduledExecutor.schedule( failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS); } + + if (totalCallCount < minCallCountToComputeFailureRate) { + // The remote call count is below the threshold required to calculate the failure rate. + return; + } + double failureRate = (failureCount * 100.0) / totalCallCount; + // Since the state can only be changed to the open state, synchronization is not required. - if (failureCount > this.failureThreshold) { + if (failureRate > this.failureRateThreshold) { this.state = State.REJECT_CALLS; } + } else { + ignoredFailures.incrementAndGet(); + if (slidingWindowSize > 0) { + scheduledExecutor.schedule(ignoredFailures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS); + } } } @Override public void recordSuccess() { - // do nothing, implement if we need to set threshold on failure rate instead of count. + successes.incrementAndGet(); + if (slidingWindowSize > 0) { + scheduledExecutor.schedule(successes::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS); + } } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java index 61dd3f8749c3e5..793eccdfa51756 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java +++ b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java @@ -695,15 +695,16 @@ public RemoteOutputsStrategyConverter() { public CircuitBreakerStrategy circuitBreakerStrategy; @Option( - name = "experimental_remote_failure_threshold", - defaultValue = "100", + name = "experimental_remote_failure_rate_threshold", + defaultValue = "10", documentationCategory = OptionDocumentationCategory.REMOTE, effectTags = {OptionEffectTag.EXECUTION}, + converter = Converters.PercentageConverter.class, help = - "Sets the allowed number of failures in a specific time window after which it stops" - + " calling to the remote cache/executor. By default the value is 100. Setting this" - + " to 0 or negative means no limitation.") - public int remoteFailureThreshold; + "Sets the allowed number of failure rate in percentage for a specific time window after which it stops " + + "calling to the remote cache/executor. By default the value is 10. Setting this to 0 means no " + + "limitation.") + public int remoteFailureRateThreshold; @Option( name = "experimental_remote_failure_window_interval", @@ -712,7 +713,7 @@ public RemoteOutputsStrategyConverter() { effectTags = {OptionEffectTag.EXECUTION}, converter = RemoteDurationConverter.class, help = - "The interval in which the failure count of the remote requests are computed. On zero or" + "The interval in which the failure rate of the remote requests are computed. On zero or" + " negative value the failure duration is computed the whole duration of the" + " execution.Following units can be used: Days (d), hours (h), minutes (m), seconds" + " (s), and milliseconds (ms). If the unit is omitted, the value is interpreted as" diff --git a/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java b/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java index 2d00a8e0e816ab..502c49ddff6225 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java @@ -18,46 +18,46 @@ import build.bazel.remote.execution.v2.Digest; import com.google.devtools.build.lib.remote.Retrier.CircuitBreaker.State; import com.google.devtools.build.lib.remote.common.CacheNotFoundException; + import java.util.ArrayList; import java.util.Collections; import java.util.List; + import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import java.util.stream.IntStream; + @RunWith(JUnit4.class) public class FailureCircuitBreakerTest { @Test - public void testRecordFailure() throws InterruptedException { - final int failureThreshold = 10; + public void testRecordFailure_withIgnoredErrors() throws InterruptedException { + final int failureRateThreshold = 10; final int windowInterval = 100; FailureCircuitBreaker failureCircuitBreaker = - new FailureCircuitBreaker(failureThreshold, windowInterval); + new FailureCircuitBreaker(failureRateThreshold, windowInterval); List listOfExceptionThrownOnFailure = new ArrayList<>(); - for (int index = 0; index < failureThreshold; index++) { + for (int index = 0; index < failureRateThreshold; index++) { listOfExceptionThrownOnFailure.add(new Exception()); } - for (int index = 0; index < failureThreshold * 9; index++) { + for (int index = 0; index < failureRateThreshold * 9; index++) { listOfExceptionThrownOnFailure.add(new CacheNotFoundException(Digest.newBuilder().build())); } Collections.shuffle(listOfExceptionThrownOnFailure); // make calls equals to threshold number of not ignored failure calls in parallel. - listOfExceptionThrownOnFailure.stream() - .parallel() - .forEach(failureCircuitBreaker::recordFailure); + listOfExceptionThrownOnFailure.stream().parallel().forEach(failureCircuitBreaker::recordFailure); assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS); // Sleep for windowInterval + 1ms. Thread.sleep(windowInterval + 1 /*to compensate any delay*/); // make calls equals to threshold number of not ignored failure calls in parallel. - listOfExceptionThrownOnFailure.stream() - .parallel() - .forEach(failureCircuitBreaker::recordFailure); + listOfExceptionThrownOnFailure.stream().parallel().forEach(failureCircuitBreaker::recordFailure); assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS); // Sleep for less than windowInterval. @@ -65,4 +65,24 @@ public void testRecordFailure() throws InterruptedException { failureCircuitBreaker.recordFailure(new Exception()); assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS); } + + @Test + public void testRecordFailure_minCallCriteriaNotMet() throws InterruptedException { + final int failureRateThreshold = 10; + final int windowInterval = 100; + final int minCallToComputeFailure = CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE; + FailureCircuitBreaker failureCircuitBreaker = + new FailureCircuitBreaker(failureRateThreshold, windowInterval); + + // make half failure call, half success call and number of total call less than minCallToComputeFailure. + IntStream.range(0, minCallToComputeFailure >> 1).parallel() + .forEach(i -> failureCircuitBreaker.recordFailure(new Exception())); + IntStream.range(0, minCallToComputeFailure >> 1).parallel().forEach(i -> failureCircuitBreaker.recordSuccess()); + assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS); + + // Sleep for less than windowInterval. + Thread.sleep(windowInterval - 20); + failureCircuitBreaker.recordFailure(new Exception()); + assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS); + } }