From fdfc9f09cc66516754135529d6e691c2a624021b Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Mon, 26 Feb 2024 23:18:43 +0100 Subject: [PATCH 01/13] SOLR-17160: Time based tracking of core admin requests with Caffeine cache --- .../solr/handler/admin/CoreAdminHandler.java | 125 ++++++++++++------ .../admin/api/GetNodeCommandStatus.java | 33 +++-- .../handler/admin/CoreAdminHandlerTest.java | 41 ++++++ .../admin/api/GetNodeCommandStatusTest.java | 9 +- 4 files changed, 144 insertions(+), 64 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index d2d823eb190..4167c3dde37 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -21,20 +21,23 @@ import static org.apache.solr.security.PermissionNameProvider.Name.CORE_EDIT_PERM; import static org.apache.solr.security.PermissionNameProvider.Name.CORE_READ_PERM; +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.Expiry; +import com.github.benmanes.caffeine.cache.Ticker; import java.io.File; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; import org.apache.solr.api.AnnotatedApi; import org.apache.solr.api.Api; import org.apache.solr.api.JerseyResource; @@ -427,11 +430,11 @@ default boolean isExpensive() { } public static class CoreAdminAsyncTracker { - private static final int MAX_TRACKED_REQUESTS = 100; public static final String RUNNING = "running"; public static final String COMPLETED = "completed"; public static final String FAILED = "failed"; - public final Map> requestStatusMap; + + private final Cache requestStatusCache; // Executor for all standard tasks (the ones that are not flagged as expensive) // We always keep 50 live threads @@ -448,11 +451,23 @@ public static class CoreAdminAsyncTracker { new SolrNamedThreadFactory("parallelCoreAdminAPIExpensiveExecutor")); public CoreAdminAsyncTracker() { - HashMap> map = new HashMap<>(3, 1.0f); - map.put(RUNNING, Collections.synchronizedMap(new LinkedHashMap<>())); - map.put(COMPLETED, Collections.synchronizedMap(new LinkedHashMap<>())); - map.put(FAILED, Collections.synchronizedMap(new LinkedHashMap<>())); - requestStatusMap = Collections.unmodifiableMap(map); + this( + Ticker.systemTicker(), + TimeUnit.MINUTES.toNanos( + Long.getLong("solr.admin.requests.running.timeout.minutes", 60L)), + TimeUnit.MINUTES.toNanos( + Long.getLong("solr.admin.requests.completed.timeout.minutes", 5L))); + } + + /** + * @param runningTimeoutNanos The time-to-keep for tasks in the RUNNING state. + * @param completedTimeoutNanos The time-to-keep for tasks in the COMPLETED or FAILED state + * after the status was polled. + */ + CoreAdminAsyncTracker(Ticker ticker, long runningTimeoutNanos, long completedTimeoutNanos) { + + TaskExpiry expiry = new TaskExpiry(runningTimeoutNanos, completedTimeoutNanos); + requestStatusCache = Caffeine.newBuilder().ticker(ticker).expireAfter(expiry).build(); } public void shutdown() { @@ -460,13 +475,22 @@ public void shutdown() { ExecutorUtil.shutdownAndAwaitTermination(expensiveExecutor); } - public Map getRequestStatusMap(String key) { - return requestStatusMap.get(key); + public TaskObject getAsyncRequestForStatus(String key) { + TaskObject task = requestStatusCache.getIfPresent(key); + + if (task != null && !RUNNING.equals(task.status) && !task.polled) { + task.polled = true; + // At the first time we retrieve the status of a completed request, do a second lookup in + // the cache. This is necessary to update the TTL of this request in the cache. + // Unfortunately, we can't force the expiration time to be refreshed without a lookup. + requestStatusCache.getIfPresent(key); + } + + return task; } public void submitAsyncTask(TaskObject taskObject) throws SolrException { - ensureTaskIdNotInUse(taskObject.taskId); - addTask(RUNNING, taskObject); + addTask(taskObject); Runnable command = () -> { @@ -497,42 +521,19 @@ public void submitAsyncTask(TaskObject taskObject) throws SolrException { } } - /** Helper method to add a task to a tracking type. */ - private void addTask(String type, TaskObject o, boolean limit) { - synchronized (getRequestStatusMap(type)) { - if (limit && getRequestStatusMap(type).size() == MAX_TRACKED_REQUESTS) { - String key = getRequestStatusMap(type).entrySet().iterator().next().getKey(); - getRequestStatusMap(type).remove(key); - } - addTask(type, o); - } - } - - private void addTask(String type, TaskObject o) { - synchronized (getRequestStatusMap(type)) { - getRequestStatusMap(type).put(o.taskId, o); - } - } - - /** Helper method to remove a task from a tracking map. */ - private void removeTask(String map, String taskId) { - synchronized (getRequestStatusMap(map)) { - getRequestStatusMap(map).remove(taskId); - } - } - - private void ensureTaskIdNotInUse(String taskId) throws SolrException { - if (getRequestStatusMap(RUNNING).containsKey(taskId) - || getRequestStatusMap(COMPLETED).containsKey(taskId) - || getRequestStatusMap(FAILED).containsKey(taskId)) { + private void addTask(TaskObject taskObject) { + // Ensure task ID is not already in use + if (requestStatusCache.asMap().containsKey(taskObject.taskId)) { throw new SolrException( ErrorCode.BAD_REQUEST, "Duplicate request with the same requestid found."); } + + taskObject.status = RUNNING; + requestStatusCache.put(taskObject.taskId, taskObject); } private void finishTask(TaskObject taskObject, boolean successful) { - removeTask(RUNNING, taskObject.taskId); - addTask(successful ? COMPLETED : FAILED, taskObject, true); + taskObject.status = successful ? COMPLETED : FAILED; } /** @@ -546,6 +547,8 @@ public static class TaskObject { final Callable task; public String rspInfo; public Object operationRspInfo; + private volatile String status; + private volatile boolean polled; public TaskObject( String taskId, String action, boolean expensive, Callable task) { @@ -574,6 +577,42 @@ public Object getOperationRspObject() { public void setOperationRspObject(SolrQueryResponse rspObject) { this.operationRspInfo = rspObject.getResponse(); } + + public String getStatus() { + return status; + } + } + + /** + * Expiration policy for Caffeine cache. Depending on whether the status of a completed task was + * already retrieved, we return {@link #runningTimeoutNanos} or {@link #completedTimeoutNanos}. + */ + private static class TaskExpiry implements Expiry { + + private final long runningTimeoutNanos; + private final long completedTimeoutNanos; + + private TaskExpiry(long runningTimeoutNanos, long completedTimeoutNanos) { + this.runningTimeoutNanos = runningTimeoutNanos; + this.completedTimeoutNanos = completedTimeoutNanos; + } + + @Override + public long expireAfterCreate(String key, TaskObject task, long currentTime) { + return runningTimeoutNanos; + } + + @Override + public long expireAfterUpdate( + String key, TaskObject task, long currentTime, long currentDuration) { + return task.polled ? completedTimeoutNanos : runningTimeoutNanos; + } + + @Override + public long expireAfterRead( + String key, TaskObject task, long currentTime, long currentDuration) { + return task.polled ? completedTimeoutNanos : runningTimeoutNanos; + } } } } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java b/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java index c6627d012e3..36758fbf2e9 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java @@ -18,7 +18,6 @@ import static org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker.COMPLETED; import static org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker.FAILED; -import static org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker.RUNNING; import jakarta.inject.Inject; import org.apache.solr.client.api.endpoint.GetNodeCommandStatusApi; @@ -26,6 +25,7 @@ import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.core.CoreContainer; import org.apache.solr.handler.admin.CoreAdminHandler; +import org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker.TaskObject; import org.apache.solr.jersey.PermissionName; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; @@ -51,25 +51,24 @@ public GetNodeCommandStatus( public GetNodeCommandStatusResponse getCommandStatus(String requestId) { ensureRequiredParameterProvided(CoreAdminParams.REQUESTID, requestId); var requestStatusResponse = new GetNodeCommandStatusResponse(); - if (coreAdminAsyncTracker.getRequestStatusMap(RUNNING).containsKey(requestId)) { - requestStatusResponse.status = RUNNING; - } else if (coreAdminAsyncTracker.getRequestStatusMap(COMPLETED).containsKey(requestId)) { - requestStatusResponse.status = COMPLETED; - requestStatusResponse.response = - coreAdminAsyncTracker.getRequestStatusMap(COMPLETED).get(requestId).getRspObject(); - requestStatusResponse.response = - coreAdminAsyncTracker - .getRequestStatusMap(COMPLETED) - .get(requestId) - .getOperationRspObject(); - } else if (coreAdminAsyncTracker.getRequestStatusMap(FAILED).containsKey(requestId)) { - requestStatusResponse.status = FAILED; - requestStatusResponse.response = - coreAdminAsyncTracker.getRequestStatusMap(FAILED).get(requestId).getRspObject(); - } else { + + TaskObject taskObject = coreAdminAsyncTracker.getAsyncRequestForStatus(requestId); + String status = taskObject != null ? taskObject.getStatus() : null; + + if (status == null) { requestStatusResponse.status = "notfound"; requestStatusResponse.msg = "No task found in running, completed or failed tasks"; + } else { + requestStatusResponse.status = status; + + if (taskObject.getStatus().equals(COMPLETED)) { + requestStatusResponse.response = taskObject.getRspObject(); + requestStatusResponse.response = taskObject.getOperationRspObject(); + } else if (taskObject.getStatus().equals(FAILED)) { + requestStatusResponse.response = taskObject.getRspObject(); + } } + return requestStatusResponse; } } diff --git a/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java index 68a31ca6879..1b2f1e3ff75 100644 --- a/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java @@ -16,6 +16,8 @@ */ package org.apache.solr.handler.admin; +import static org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker.COMPLETED; + import java.io.Reader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -24,7 +26,9 @@ import java.nio.file.StandardCopyOption; import java.util.Map; import java.util.Properties; +import java.util.Set; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.io.file.PathUtils; import org.apache.lucene.util.Constants; import org.apache.solr.SolrTestCaseJ4; @@ -43,6 +47,8 @@ import org.apache.solr.core.CoreDescriptor; import org.apache.solr.core.SolrCore; import org.apache.solr.embedded.JettySolrRunner; +import org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker; +import org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker.TaskObject; import org.apache.solr.response.SolrQueryResponse; import org.junit.BeforeClass; import org.junit.Test; @@ -528,4 +534,39 @@ public void testNonexistentCoreReload() throws Exception { e.getMessage()); admin.close(); } + + @Test + public void testTrackedRequestExpiration() throws Exception { + // Create a tracker with controlled clock, relative to 0 + AtomicLong clock = new AtomicLong(0L); + CoreAdminAsyncTracker asyncTracker = new CoreAdminAsyncTracker(clock::get, 100L, 10L); + try { + Set tasks = + Set.of( + new TaskObject("id1", "ACTION", false, SolrQueryResponse::new), + new TaskObject("id2", "ACTION", false, SolrQueryResponse::new)); + + // Submit all tasks and wait for internal status to be COMPLETED + tasks.forEach(asyncTracker::submitAsyncTask); + while (!tasks.stream().allMatch(t -> COMPLETED.equals(t.getStatus()))) { + Thread.sleep(10L); + } + + // Timeout for running tasks is 100n, so status can be retrieved after 20n. + // But timeout for complete tasks is 10n once we polled the status at least once, so status + // is not available anymore 20n later. + clock.set(20); + assertEquals(COMPLETED, asyncTracker.getAsyncRequestForStatus("id1").getStatus()); + clock.set(40L); + assertNull(asyncTracker.getAsyncRequestForStatus("id1")); + + // Move the clock after the running timeout. + // Status of second task is not available anymore, even if it wasn't retrieved yet + clock.set(110L); + assertNull(asyncTracker.getAsyncRequestForStatus("id2")); + + } finally { + asyncTracker.shutdown(); + } + } } diff --git a/solr/core/src/test/org/apache/solr/handler/admin/api/GetNodeCommandStatusTest.java b/solr/core/src/test/org/apache/solr/handler/admin/api/GetNodeCommandStatusTest.java index 4a52383bf20..b22d3d4d4ec 100644 --- a/solr/core/src/test/org/apache/solr/handler/admin/api/GetNodeCommandStatusTest.java +++ b/solr/core/src/test/org/apache/solr/handler/admin/api/GetNodeCommandStatusTest.java @@ -19,12 +19,12 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import java.util.Map; import org.apache.solr.SolrTestCase; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.client.api.endpoint.GetNodeCommandStatusApi; import org.apache.solr.core.CoreContainer; import org.apache.solr.handler.admin.CoreAdminHandler; +import org.apache.solr.handler.admin.CoreAdminHandler.CoreAdminAsyncTracker.TaskObject; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -33,7 +33,6 @@ public class GetNodeCommandStatusTest extends SolrTestCase { private CoreContainer mockCoreContainer; private CoreAdminHandler.CoreAdminAsyncTracker mockAsyncTracker; - private CoreAdminHandler.CoreAdminAsyncTracker.TaskObject taskObject; private GetNodeCommandStatusApi requestNodeCommandApi; @BeforeClass @@ -45,7 +44,6 @@ public static void ensureWorkingMockito() { public void setupMocks() { mockCoreContainer = mock(CoreContainer.class); mockAsyncTracker = mock(CoreAdminHandler.CoreAdminAsyncTracker.class); - taskObject = new CoreAdminHandler.CoreAdminAsyncTracker.TaskObject(null, null, false, null); requestNodeCommandApi = new GetNodeCommandStatus(mockCoreContainer, mockAsyncTracker, null, null); } @@ -89,6 +87,9 @@ public void testReturnsStatusOfFailedCommandId() { } private void whenTaskExistsWithStatus(String taskId, String status) { - when(mockAsyncTracker.getRequestStatusMap(status)).thenReturn(Map.of(taskId, taskObject)); + TaskObject taskObject = mock(TaskObject.class); + when(taskObject.getStatus()).thenReturn(status); + + when(mockAsyncTracker.getAsyncRequestForStatus(taskId)).thenReturn(taskObject); } } From 8df6c8cfaac14340f990eace25da4845ac58e36b Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Wed, 28 Feb 2024 19:46:41 +0100 Subject: [PATCH 02/13] Added max size to the cache --- .../solr/handler/admin/CoreAdminHandler.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 4167c3dde37..52200af8632 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -430,6 +430,13 @@ default boolean isExpensive() { } public static class CoreAdminAsyncTracker { + /** + * Max number of requests we track in the Caffeine cache. This limit is super high on + * purpose, we're not supposed to hit it. This is just a protection to grow in memory too + * much when receiving an abusive number of admin requests. + */ + private static final int MAX_TRACKED_REQUESTS = 10_000; + public static final String RUNNING = "running"; public static final String COMPLETED = "completed"; public static final String FAILED = "failed"; @@ -467,7 +474,12 @@ public CoreAdminAsyncTracker() { CoreAdminAsyncTracker(Ticker ticker, long runningTimeoutNanos, long completedTimeoutNanos) { TaskExpiry expiry = new TaskExpiry(runningTimeoutNanos, completedTimeoutNanos); - requestStatusCache = Caffeine.newBuilder().ticker(ticker).expireAfter(expiry).build(); + requestStatusCache = + Caffeine.newBuilder() + .ticker(ticker) + .maximumSize(MAX_TRACKED_REQUESTS) + .expireAfter(expiry) + .build(); } public void shutdown() { From ddd45a6828c1d7844f53b32dbe227bf9acb20dc3 Mon Sep 17 00:00:00 2001 From: Pierre Salagnac <82967811+psalagnac@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:42:28 +0200 Subject: [PATCH 03/13] Update solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java Co-authored-by: David Smiley --- .../java/org/apache/solr/handler/admin/CoreAdminHandler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 52200af8632..25adb7d365e 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -441,7 +441,7 @@ public static class CoreAdminAsyncTracker { public static final String COMPLETED = "completed"; public static final String FAILED = "failed"; - private final Cache requestStatusCache; + private final Cache requestStatusCache; // key by ID // Executor for all standard tasks (the ones that are not flagged as expensive) // We always keep 50 live threads From eda2c059299e8c664fc34f45e803cc1976e4d97b Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Mon, 10 Jun 2024 16:41:30 +0200 Subject: [PATCH 04/13] Use EnvUtils --- .../apache/solr/handler/admin/CoreAdminHandler.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 25adb7d365e..47637f1989b 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -50,6 +50,7 @@ import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.EnvUtils; import org.apache.solr.common.util.ExecutorUtil; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SolrNamedThreadFactory; @@ -431,9 +432,9 @@ default boolean isExpensive() { public static class CoreAdminAsyncTracker { /** - * Max number of requests we track in the Caffeine cache. This limit is super high on - * purpose, we're not supposed to hit it. This is just a protection to grow in memory too - * much when receiving an abusive number of admin requests. + * Max number of requests we track in the Caffeine cache. This limit is super high on purpose, + * we're not supposed to hit it. This is just a protection to grow in memory too much when + * receiving an abusive number of admin requests. */ private static final int MAX_TRACKED_REQUESTS = 10_000; @@ -461,9 +462,9 @@ public CoreAdminAsyncTracker() { this( Ticker.systemTicker(), TimeUnit.MINUTES.toNanos( - Long.getLong("solr.admin.requests.running.timeout.minutes", 60L)), + EnvUtils.getEnvAsLong("solr.admin.requests.running.timeout.minutes", 60L)), TimeUnit.MINUTES.toNanos( - Long.getLong("solr.admin.requests.completed.timeout.minutes", 5L))); + EnvUtils.getEnvAsLong("solr.admin.requests.completed.timeout.minutes", 5L))); } /** From b29b49893eeb30ab1c003453ed4040d22dd1b348 Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Mon, 10 Jun 2024 16:45:07 +0200 Subject: [PATCH 05/13] Avoid useless method calls. --- .../apache/solr/handler/admin/api/GetNodeCommandStatus.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java b/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java index 36758fbf2e9..258e83e16a0 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/GetNodeCommandStatus.java @@ -61,10 +61,10 @@ public GetNodeCommandStatusResponse getCommandStatus(String requestId) { } else { requestStatusResponse.status = status; - if (taskObject.getStatus().equals(COMPLETED)) { + if (status.equals(COMPLETED)) { requestStatusResponse.response = taskObject.getRspObject(); requestStatusResponse.response = taskObject.getOperationRspObject(); - } else if (taskObject.getStatus().equals(FAILED)) { + } else if (status.equals(FAILED)) { requestStatusResponse.response = taskObject.getRspObject(); } } From 7e87990b40f6010bdbf116a82a2e45353cd740ca Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Mon, 10 Jun 2024 16:54:19 +0200 Subject: [PATCH 06/13] Rename and document flag. --- .../solr/handler/admin/CoreAdminHandler.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 47637f1989b..44c9b94cb99 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -491,8 +491,8 @@ public void shutdown() { public TaskObject getAsyncRequestForStatus(String key) { TaskObject task = requestStatusCache.getIfPresent(key); - if (task != null && !RUNNING.equals(task.status) && !task.polled) { - task.polled = true; + if (task != null && !RUNNING.equals(task.status) && !task.polledAfterCompletion) { + task.polledAfterCompletion = true; // At the first time we retrieve the status of a completed request, do a second lookup in // the cache. This is necessary to update the TTL of this request in the cache. // Unfortunately, we can't force the expiration time to be refreshed without a lookup. @@ -561,7 +561,12 @@ public static class TaskObject { public String rspInfo; public Object operationRspInfo; private volatile String status; - private volatile boolean polled; + + /** + * Flag set to true once the task is complete (can be in error) and the status was polled + * already once. Once set, the time we keep the task status is shortened. + */ + private volatile boolean polledAfterCompletion; public TaskObject( String taskId, String action, boolean expensive, Callable task) { @@ -618,13 +623,13 @@ public long expireAfterCreate(String key, TaskObject task, long currentTime) { @Override public long expireAfterUpdate( String key, TaskObject task, long currentTime, long currentDuration) { - return task.polled ? completedTimeoutNanos : runningTimeoutNanos; + return task.polledAfterCompletion ? completedTimeoutNanos : runningTimeoutNanos; } @Override public long expireAfterRead( String key, TaskObject task, long currentTime, long currentDuration) { - return task.polled ? completedTimeoutNanos : runningTimeoutNanos; + return task.polledAfterCompletion ? completedTimeoutNanos : runningTimeoutNanos; } } } From 09c3430080d6a1b9ce6d60f9b022907719eb502b Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Mon, 10 Jun 2024 17:15:52 +0200 Subject: [PATCH 07/13] Fix race condition when adding two requests with same name --- .../solr/handler/admin/CoreAdminHandler.java | 6 ++++- .../handler/admin/CoreAdminHandlerTest.java | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 44c9b94cb99..99cd211f288 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -536,7 +536,11 @@ public void submitAsyncTask(TaskObject taskObject) throws SolrException { private void addTask(TaskObject taskObject) { // Ensure task ID is not already in use - if (requestStatusCache.asMap().containsKey(taskObject.taskId)) { + TaskObject taskInCache = requestStatusCache.get(taskObject.taskId, n -> taskObject); + + // If we get a different task instance, it means one was already in the cache with the + // same name. Just reject the new one. + if (taskInCache != taskObject) { throw new SolrException( ErrorCode.BAD_REQUEST, "Duplicate request with the same requestid found."); } diff --git a/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java index 1b2f1e3ff75..d62b2048074 100644 --- a/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java @@ -569,4 +569,26 @@ public void testTrackedRequestExpiration() throws Exception { asyncTracker.shutdown(); } } + + /** Check we reject a task is the async ID already exists. */ + @Test + public void testDuplicatedRequestId() { + + // Different tasks but with same ID + TaskObject task1 = new TaskObject("id1", "ACTION", false, null); + TaskObject task2 = new TaskObject("id1", "ACTION", false, null); + + CoreAdminAsyncTracker asyncTracker = new CoreAdminAsyncTracker(); + try { + asyncTracker.submitAsyncTask(task1); + try { + asyncTracker.submitAsyncTask(task2); + fail("Task should have been rejected."); + } catch (SolrException e) { + assertEquals("Duplicate request with the same requestid found.", e.getMessage()); + } + } finally { + asyncTracker.shutdown(); + } + } } From 525aa3530db1fa7248d2db20ed588971ea1c56fe Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Thu, 27 Jun 2024 12:24:21 +0200 Subject: [PATCH 08/13] Replace getEnvAsLong by getPropertyAsLong --- .../java/org/apache/solr/handler/admin/CoreAdminHandler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 99cd211f288..e46f6c4d03f 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -462,9 +462,9 @@ public CoreAdminAsyncTracker() { this( Ticker.systemTicker(), TimeUnit.MINUTES.toNanos( - EnvUtils.getEnvAsLong("solr.admin.requests.running.timeout.minutes", 60L)), + EnvUtils.getPropertyAsLong("solr.admin.requests.running.timeout.minutes", 60L)), TimeUnit.MINUTES.toNanos( - EnvUtils.getEnvAsLong("solr.admin.requests.completed.timeout.minutes", 5L))); + EnvUtils.getPropertyAsLong("solr.admin.requests.completed.timeout.minutes", 5L))); } /** From c4393771de733e245a4ea37a86165a308cc24de8 Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Thu, 27 Jun 2024 12:30:48 +0200 Subject: [PATCH 09/13] Add config property for cache size --- .../solr/handler/admin/CoreAdminHandler.java | 3 ++- .../java/org/apache/solr/common/util/EnvUtils.java | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index e46f6c4d03f..30f092ecb73 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -436,7 +436,8 @@ public static class CoreAdminAsyncTracker { * we're not supposed to hit it. This is just a protection to grow in memory too much when * receiving an abusive number of admin requests. */ - private static final int MAX_TRACKED_REQUESTS = 10_000; + private static final int MAX_TRACKED_REQUESTS = + EnvUtils.getPropertyAsInteger("solr.admin.requests.running.max", 10_000); public static final String RUNNING = "running"; public static final String COMPLETED = "completed"; diff --git a/solr/solrj/src/java/org/apache/solr/common/util/EnvUtils.java b/solr/solrj/src/java/org/apache/solr/common/util/EnvUtils.java index 29c9586ca96..35c61360b83 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/EnvUtils.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/EnvUtils.java @@ -178,6 +178,20 @@ private static String camelCaseToDotSeparated(String key) { } /** Get property as integer */ + public static Integer getPropertyAsInteger(String key) { + return getPropertyAsInteger(key, null); + } + + /** Get property as integer, or default value */ + public static Integer getPropertyAsInteger(String key, Integer defaultValue) { + String value = getProperty(key); + if (value == null) { + return defaultValue; + } + return Integer.parseInt(value); + } + + /** Get property as long */ public static Long getPropertyAsLong(String key) { return getPropertyAsLong(key, null); } From ec33077fc1fdc4f169481c161080d6961db8cccd Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Thu, 27 Jun 2024 12:45:56 +0200 Subject: [PATCH 10/13] Remove duplicate put() to the cache --- .../org/apache/solr/handler/admin/CoreAdminHandler.java | 9 +++++++-- .../apache/solr/handler/admin/CoreAdminHandlerTest.java | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 30f092ecb73..68062edae5a 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -537,7 +537,13 @@ public void submitAsyncTask(TaskObject taskObject) throws SolrException { private void addTask(TaskObject taskObject) { // Ensure task ID is not already in use - TaskObject taskInCache = requestStatusCache.get(taskObject.taskId, n -> taskObject); + TaskObject taskInCache = + requestStatusCache.get( + taskObject.taskId, + n -> { + taskObject.status = RUNNING; + return taskObject; + }); // If we get a different task instance, it means one was already in the cache with the // same name. Just reject the new one. @@ -547,7 +553,6 @@ private void addTask(TaskObject taskObject) { } taskObject.status = RUNNING; - requestStatusCache.put(taskObject.taskId, taskObject); } private void finishTask(TaskObject taskObject, boolean successful) { diff --git a/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java index daf3dae1162..2dd9fce1224 100644 --- a/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/handler/admin/CoreAdminHandlerTest.java @@ -591,6 +591,9 @@ public void testDuplicatedRequestId() { } catch (SolrException e) { assertEquals("Duplicate request with the same requestid found.", e.getMessage()); } + + assertNotNull(task1.getStatus()); + assertNull(task2.getStatus()); } finally { asyncTracker.shutdown(); } From 9629e2a385b2629b86272b8552079529560981f2 Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Thu, 4 Jul 2024 15:54:15 +0200 Subject: [PATCH 11/13] Remove useless assignment --- .../java/org/apache/solr/handler/admin/CoreAdminHandler.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 68062edae5a..d78d92e53b5 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -551,8 +551,6 @@ private void addTask(TaskObject taskObject) { throw new SolrException( ErrorCode.BAD_REQUEST, "Duplicate request with the same requestid found."); } - - taskObject.status = RUNNING; } private void finishTask(TaskObject taskObject, boolean successful) { From a1b4ad375585db57372d88e2b7ed98c3037c22b0 Mon Sep 17 00:00:00 2001 From: Pierre Salagnac Date: Thu, 4 Jul 2024 15:57:08 +0200 Subject: [PATCH 12/13] Rename config properties --- .../org/apache/solr/handler/admin/CoreAdminHandler.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index d78d92e53b5..c35117fb088 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -437,7 +437,7 @@ public static class CoreAdminAsyncTracker { * receiving an abusive number of admin requests. */ private static final int MAX_TRACKED_REQUESTS = - EnvUtils.getPropertyAsInteger("solr.admin.requests.running.max", 10_000); + EnvUtils.getPropertyAsInteger("solr.admin.async.max", 10_000); public static final String RUNNING = "running"; public static final String COMPLETED = "completed"; @@ -452,7 +452,7 @@ public static class CoreAdminAsyncTracker { 50, new SolrNamedThreadFactory("parallelCoreAdminAPIBaseExecutor")); // Executor for expensive tasks - // We keep the number number of max threads very low to have throttling for expensive tasks + // We keep the number of max threads very low to have throttling for expensive tasks private ExecutorService expensiveExecutor = ExecutorUtil.newMDCAwareCachedThreadPool( 5, @@ -463,9 +463,9 @@ public CoreAdminAsyncTracker() { this( Ticker.systemTicker(), TimeUnit.MINUTES.toNanos( - EnvUtils.getPropertyAsLong("solr.admin.requests.running.timeout.minutes", 60L)), + EnvUtils.getPropertyAsLong("solr.admin.async.timeout.minutes", 60L)), TimeUnit.MINUTES.toNanos( - EnvUtils.getPropertyAsLong("solr.admin.requests.completed.timeout.minutes", 5L))); + EnvUtils.getPropertyAsLong("solr.admin.async.timeout.completed.minutes", 5L))); } /** From 42c257543e47839a3569d07c8f99da0c9997ca65 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 12 Jul 2024 17:14:22 -0400 Subject: [PATCH 13/13] CHANGES.txt --- solr/CHANGES.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 8216966d3df..0f7ef2caa9f 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -148,6 +148,10 @@ Improvements * SOLR-17346: Synchronise stopwords from snowball with those in Lucene (Alastair Porter via Houston Putman) +* SOLR-17160: Core Admin "async" request status tracking is no longer capped at 100; it's 10k. + Statuses are now removed 5 minutes after the read of a completed/failed status. Helps collection + async backup/restore and other operations scale to 100+ shards. (Pierre Salagnac, David Smiley) + Optimizations --------------------- * SOLR-17257: Both Minimize Cores and the Affinity replica placement strategies would over-gather