From 8ca4a5d657f04d5467d42334d5ed7c9efac2b9ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 22 Sep 2023 10:05:15 +0200 Subject: [PATCH] SOLR-16968: The MemoryCircuitBreaker now uses average heap usage (#1905) --- solr/CHANGES.txt | 2 + .../java/org/apache/solr/core/SolrCore.java | 17 +++- .../AveragingMetricProvider.java | 80 +++++++++++++++++++ .../util/circuitbreaker/CircuitBreaker.java | 9 ++- .../circuitbreaker/CircuitBreakerManager.java | 14 ++++ .../CircuitBreakerRegistry.java | 75 ++++++++++++++--- .../circuitbreaker/MemoryCircuitBreaker.java | 70 +++++++++++----- .../solr/util/BaseTestCircuitBreaker.java | 31 ++++--- 8 files changed, 253 insertions(+), 45 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/util/circuitbreaker/AveragingMetricProvider.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 70a93e99ab6..18114626a2b 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -129,6 +129,8 @@ Improvements * SOLR-16970: SOLR_OPTS is now able to override options set by the Solr control scripts, "bin/solr" and "bin/solr.cmd". (Houston Putman) +* SOLR-16968: The MemoryCircuitBreaker now uses average heap usage over the last 30 seconds (janhoy, Christine Poerschke) + * SOLR-14886: Suppress stack traces in query response (Isabelle Giguere via Alex Deparvu) * SOLR-16461: `/solr/coreName/replication?command=backup` now has a v2 equivalent, available at diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java index 944d4684016..f5e28b34aa0 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrCore.java +++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java @@ -1088,9 +1088,6 @@ private SolrCore( solrMetricsContext = coreMetricManager.getSolrMetricsContext(); this.coreMetricManager.loadReporters(); - // init pluggable circuit breakers - initPlugins(null, CircuitBreaker.class); - if (updateHandler == null) { directoryFactory = initDirectoryFactory(); recoveryStrategyBuilder = initRecoveryStrategyBuilder(); @@ -1115,6 +1112,9 @@ private SolrCore( // initialize core metrics initializeMetrics(solrMetricsContext, null); + // init pluggable circuit breakers, after metrics because some circuit breakers use metrics + initPlugins(null, CircuitBreaker.class); + SolrFieldCacheBean solrFieldCacheBean = new SolrFieldCacheBean(); // this is registered at the CONTAINER level because it's not core-specific - for now we // also register it here for back-compat @@ -1764,6 +1764,17 @@ private void doClose() { ExecutorUtil.shutdownAndAwaitTermination(coreAsyncTaskExecutor); + // Close circuit breakers that may have background threads, before metrics because some circuit + // breakers use metrics + try { + getCircuitBreakerRegistry().close(); + } catch (Throwable e) { + log.error("Exception closing circuit breakers", e); + if (e instanceof Error) { + throw (Error) e; + } + } + // stop reporting metrics try { coreMetricManager.close(); diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/AveragingMetricProvider.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/AveragingMetricProvider.java new file mode 100644 index 00000000000..60161e98181 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/AveragingMetricProvider.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.util.circuitbreaker; + +import com.google.common.util.concurrent.AtomicDouble; +import java.io.Closeable; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.solr.common.util.ExecutorUtil; +import org.apache.solr.common.util.SolrNamedThreadFactory; +import org.apache.solr.logging.CircularList; + +/** Averages the metric value over a period of time */ +public class AveragingMetricProvider implements Closeable { + private final CircularList samplesRingBuffer; + private ScheduledExecutorService executor; + private final AtomicDouble currentAverageValue = new AtomicDouble(-1); + + /** + * Creates an instance with an executor that runs every sampleInterval seconds and averages over + * numSamples samples. + * + * @param metricProvider metric provider that will provide a value + * @param numSamples number of samples to calculate average for + * @param sampleInterval interval between each sample + */ + public AveragingMetricProvider( + MetricProvider metricProvider, int numSamples, long sampleInterval) { + this.samplesRingBuffer = new CircularList<>(numSamples); + executor = + Executors.newSingleThreadScheduledExecutor( + new SolrNamedThreadFactory( + "AveragingMetricProvider-" + metricProvider.getClass().getSimpleName())); + executor.scheduleWithFixedDelay( + () -> { + samplesRingBuffer.add(metricProvider.getMetricValue()); + currentAverageValue.set( + samplesRingBuffer.toList().stream() + .mapToDouble(Double::doubleValue) + .average() + .orElse(-1)); + }, + 0, + sampleInterval, + TimeUnit.SECONDS); + } + + /** + * Return current average. This is a cached value, so calling this method will not incur any + * calculations + */ + public double getMetricValue() { + return currentAverageValue.get(); + } + + @Override + public void close() { + ExecutorUtil.shutdownAndAwaitTermination(executor); + } + + /** Interface to provide the metric value. */ + public interface MetricProvider { + double getMetricValue(); + } +} diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java index 4b9d24bb694..78841cceaf7 100644 --- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java +++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java @@ -17,6 +17,8 @@ package org.apache.solr.util.circuitbreaker; +import java.io.Closeable; +import java.io.IOException; import java.util.List; import java.util.Locale; import java.util.Set; @@ -41,7 +43,7 @@ * * @lucene.experimental */ -public abstract class CircuitBreaker implements NamedListInitializedPlugin { +public abstract class CircuitBreaker implements NamedListInitializedPlugin, Closeable { // Only query requests are checked by default private Set requestTypes = Set.of(SolrRequestType.QUERY); private final List SUPPORTED_TYPES = @@ -60,6 +62,11 @@ public CircuitBreaker() {} /** Get error message when the circuit breaker triggers */ public abstract String getErrorMessage(); + @Override + public void close() throws IOException { + // Nothing to do by default + } + /** * Set the request types for which this circuit breaker should be checked. If not called, the * circuit breaker will be checked for the {@link SolrRequestType#QUERY} request type only. diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java index 02e3c7af676..3ca0c760a86 100644 --- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java +++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java @@ -17,6 +17,7 @@ package org.apache.solr.util.circuitbreaker; +import java.io.IOException; import java.lang.invoke.MethodHandles; import org.apache.solr.common.util.NamedList; import org.slf4j.Logger; @@ -77,6 +78,19 @@ public void init(NamedList args) { } } + @Override + public void close() throws IOException { + try { + if (memEnabled) { + memCB.close(); + } + } finally { + if (cpuEnabled) { + cpuCB.close(); + } + } + } + // The methods below will be called by super class during init public void setMemEnabled(String enabled) { this.memEnabled = Boolean.getBoolean(enabled); diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java index 84c2f61fb9b..a7081df96f6 100644 --- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java +++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java @@ -18,12 +18,19 @@ package org.apache.solr.util.circuitbreaker; import com.google.common.annotations.VisibleForTesting; +import java.io.Closeable; +import java.io.IOException; +import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.solr.client.solrj.SolrRequest.SolrRequestType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Keeps track of all registered circuit breaker instances for various request types. Responsible @@ -32,26 +39,35 @@ * @lucene.experimental * @since 9.4 */ -public class CircuitBreakerRegistry { +public class CircuitBreakerRegistry implements Closeable { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private final Map> circuitBreakerMap = new HashMap<>(); public CircuitBreakerRegistry() {} public void register(CircuitBreaker circuitBreaker) { - circuitBreaker - .getRequestTypes() - .forEach( - r -> { - List list = - circuitBreakerMap.computeIfAbsent(r, k -> new ArrayList<>()); - list.add(circuitBreaker); - }); + synchronized (circuitBreakerMap) { + circuitBreaker + .getRequestTypes() + .forEach( + r -> { + List list = + circuitBreakerMap.computeIfAbsent(r, k -> new ArrayList<>()); + list.add(circuitBreaker); + if (log.isInfoEnabled()) { + log.info( + "Registered circuit breaker {} for request type(s) {}", + circuitBreaker.getClass().getSimpleName(), + r); + } + }); + } } @VisibleForTesting - public void deregisterAll() { - circuitBreakerMap.clear(); + public void deregisterAll() throws IOException { + this.close(); } /** @@ -97,4 +113,41 @@ public static String toErrorMessage(List circuitBreakerList) { public boolean isEnabled(SolrRequestType requestType) { return circuitBreakerMap.containsKey(requestType); } + + @Override + public void close() throws IOException { + synchronized (circuitBreakerMap) { + final AtomicInteger closeFailedCounter = new AtomicInteger(0); + circuitBreakerMap + .values() + .forEach( + list -> + list.forEach( + it -> { + try { + if (log.isDebugEnabled()) { + log.debug( + "Closed circuit breaker {} for request type(s) {}", + it.getClass().getSimpleName(), + it.getRequestTypes()); + } + it.close(); + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error( + String.format( + Locale.ROOT, + "Failed to close circuit breaker %s", + it.getClass().getSimpleName()), + e); + } + closeFailedCounter.incrementAndGet(); + } + })); + circuitBreakerMap.clear(); + if (closeFailedCounter.get() > 0) { + throw new IOException("Failed to close " + closeFailedCounter.get() + " circuit breakers"); + } + } + } } diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java index 3004d732e4d..4a3eb3f5b9f 100644 --- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java +++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java @@ -17,32 +17,64 @@ package org.apache.solr.util.circuitbreaker; +import java.io.IOException; import java.lang.invoke.MethodHandles; import java.lang.management.ManagementFactory; import java.lang.management.MemoryMXBean; +import org.apache.solr.util.RefCounted; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Tracks the current JVM heap usage and triggers if it exceeds the defined percentage of the - * maximum heap size allocated to the JVM. This circuit breaker is a part of the default - * CircuitBreakerRegistry so is checked for every request -- hence it is realtime. Once the memory - * usage goes below the threshold, it will start allowing queries again. + * Tracks the current JVM heap usage and triggers if a moving heap usage average over 30 seconds + * exceeds the defined percentage of the maximum heap size allocated to the JVM. Once the average + * memory usage goes below the threshold, it will start allowing queries again. * *

The memory threshold is defined as a percentage of the maximum memory allocated -- see - * memThreshold in solrconfig.xml. + * memThreshold in solrconfig.xml. */ public class MemoryCircuitBreaker extends CircuitBreaker { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final MemoryMXBean MEMORY_MX_BEAN = ManagementFactory.getMemoryMXBean(); + // One shared provider / executor for all instances of this class + private static RefCounted averagingMetricProvider; private long heapMemoryThreshold; private static final ThreadLocal seenMemory = ThreadLocal.withInitial(() -> 0L); private static final ThreadLocal allowedMemory = ThreadLocal.withInitial(() -> 0L); + /** Creates an instance which averages over 6 samples during last 30 seconds. */ public MemoryCircuitBreaker() { + this(6, 5); + } + + /** + * Constructor that allows override of sample interval for which the memory usage is fetched. This + * is provided for testing, not intended for general use because the average metric provider + * implementation is the same for all instances of the class. + * + * @param numSamples number of samples to calculate average for + * @param sampleInterval interval between each sample + */ + protected MemoryCircuitBreaker(int numSamples, int sampleInterval) { super(); + synchronized (MemoryCircuitBreaker.class) { + if (averagingMetricProvider == null || averagingMetricProvider.getRefcount() == 0) { + averagingMetricProvider = + new RefCounted<>( + new AveragingMetricProvider( + () -> MEMORY_MX_BEAN.getHeapMemoryUsage().getUsed(), + numSamples, + sampleInterval)) { + @Override + protected void close() { + get().close(); + } + }; + } + averagingMetricProvider.incref(); + } } public void setThreshold(double thresholdValueInPercentage) { @@ -60,14 +92,11 @@ public void setThreshold(double thresholdValueInPercentage) { } } - // TODO: An optimization can be to trip the circuit breaker for a duration of time - // after the circuit breaker condition is matched. This will optimize for per call - // overhead of calculating the condition parameters but can result in false positives. @Override public boolean isTripped() { long localAllowedMemory = getCurrentMemoryThreshold(); - long localSeenMemory = calculateLiveMemoryUsage(); + long localSeenMemory = getAvgMemoryUsage(); allowedMemory.set(localAllowedMemory); @@ -76,6 +105,10 @@ public boolean isTripped() { return (localSeenMemory >= localAllowedMemory); } + protected long getAvgMemoryUsage() { + return (long) averagingMetricProvider.get().getMetricValue(); + } + @Override public String getErrorMessage() { return "Memory Circuit Breaker triggered as JVM heap usage values are greater than allocated threshold. " @@ -89,17 +122,12 @@ private long getCurrentMemoryThreshold() { return heapMemoryThreshold; } - /** - * Calculate the live memory usage for the system. This method has package visibility to allow - * using for testing. - * - * @return Memory usage in bytes. - */ - protected long calculateLiveMemoryUsage() { - // NOTE: MemoryUsageGaugeSet provides memory usage statistics but we do not use them - // here since it will require extra allocations and incur cost, hence it is cheaper to use - // MemoryMXBean directly. Ideally, this call should not add noticeable - // latency to a query -- but if it does, please signify on SOLR-14588 - return MEMORY_MX_BEAN.getHeapMemoryUsage().getUsed(); + @Override + public void close() throws IOException { + synchronized (MemoryCircuitBreaker.class) { + if (averagingMetricProvider != null && averagingMetricProvider.getRefcount() > 0) { + averagingMetricProvider.decref(); + } + } } } diff --git a/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java b/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java index 71c6fe67f8d..14c83df771a 100644 --- a/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java +++ b/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java @@ -19,6 +19,7 @@ import static org.hamcrest.CoreMatchers.containsString; +import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; @@ -43,6 +44,8 @@ public abstract class BaseTestCircuitBreaker extends SolrTestCaseJ4 { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final CircuitBreaker dummyMemBreaker = new MemoryCircuitBreaker(); + private static final CircuitBreaker dummyCBManager = new CircuitBreakerManager(); protected static void indexDocs() { removeAllExistingCircuitBreakers(); @@ -62,11 +65,13 @@ protected static void indexDocs() { @Override public void tearDown() throws Exception { super.tearDown(); + dummyMemBreaker.close(); + dummyCBManager.close(); } @After public void after() { - h.getCore().getCircuitBreakerRegistry().deregisterAll(); + removeAllExistingCircuitBreakers(); } public void testCBAlwaysTrips() { @@ -116,9 +121,10 @@ public void testCBFakeMemoryPressure() throws Exception { } public void testBadRequestType() { + expectThrows( IllegalArgumentException.class, - () -> new MemoryCircuitBreaker().setRequestTypes(List.of("badRequestType"))); + () -> dummyMemBreaker.setRequestTypes(List.of("badRequestType"))); } public void testBuildingMemoryPressure() { @@ -236,17 +242,21 @@ public void testResponseWithCBTiming() { "//lst[@name='process']/double[@name='time']"); } - public void testErrorCode() { + public void testErrorCode() throws Exception { assertEquals( SolrException.ErrorCode.SERVICE_UNAVAILABLE, - CircuitBreaker.getErrorCode(List.of(new CircuitBreakerManager()))); + CircuitBreaker.getErrorCode(List.of(dummyCBManager))); assertEquals( SolrException.ErrorCode.TOO_MANY_REQUESTS, - CircuitBreaker.getErrorCode(List.of(new MemoryCircuitBreaker()))); + CircuitBreaker.getErrorCode(List.of(dummyMemBreaker))); } private static void removeAllExistingCircuitBreakers() { - h.getCore().getCircuitBreakerRegistry().deregisterAll(); + try { + h.getCore().getCircuitBreakerRegistry().deregisterAll(); + } catch (IOException e) { + fail("Failed to unload circuit breakers"); + } } private static class MockCircuitBreaker extends MemoryCircuitBreaker { @@ -264,10 +274,12 @@ public boolean isTripped() { } private static class FakeMemoryPressureCircuitBreaker extends MemoryCircuitBreaker { + public FakeMemoryPressureCircuitBreaker() { + super(1, 1); + } @Override - protected long calculateLiveMemoryUsage() { - // Return a number large enough to trigger a pushback from the circuit breaker + protected long getAvgMemoryUsage() { return Long.MAX_VALUE; } } @@ -276,11 +288,12 @@ private static class BuildingUpMemoryPressureCircuitBreaker extends MemoryCircui private AtomicInteger count; public BuildingUpMemoryPressureCircuitBreaker() { + super(1, 1); this.count = new AtomicInteger(0); } @Override - protected long calculateLiveMemoryUsage() { + protected long getAvgMemoryUsage() { int localCount = count.getAndIncrement(); if (localCount >= 4) {