From 234e70d6174cbc6db31c43a3d1da982af1ec2130 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Fri, 30 Nov 2018 16:06:58 -0500 Subject: [PATCH] Tasks: Retry if task can't be written (#35054) Adds about a minute worth of backoffs and retries to saving task results so it is *much* more likely that a busy cluster won't lose task results. This isn't an ideal solution to losing task results, but it is an incremental improvement. If all of the retries fail when still log the task result, but that is far from ideal. Closes #33764 --- .../tasks/TaskResultsService.java | 34 ++++- .../node/tasks/TaskStorageRetryIT.java | 127 ++++++++++++++++++ .../tasks/TaskResultsServiceTests.java | 40 ++++++ 3 files changed, 195 insertions(+), 6 deletions(-) create mode 100644 server/src/test/java/org/elasticsearch/action/admin/cluster/node/tasks/TaskStorageRetryIT.java create mode 100644 server/src/test/java/org/elasticsearch/tasks/TaskResultsServiceTests.java diff --git a/server/src/main/java/org/elasticsearch/tasks/TaskResultsService.java b/server/src/main/java/org/elasticsearch/tasks/TaskResultsService.java index b968d17d7e94a..b05e87db91943 100644 --- a/server/src/main/java/org/elasticsearch/tasks/TaskResultsService.java +++ b/server/src/main/java/org/elasticsearch/tasks/TaskResultsService.java @@ -27,7 +27,7 @@ import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.admin.indices.create.CreateIndexRequest; import org.elasticsearch.action.admin.indices.create.CreateIndexResponse; -import org.elasticsearch.action.admin.indices.create.TransportCreateIndexAction; +import org.elasticsearch.action.bulk.BackoffPolicy; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.action.index.IndexResponse; import org.elasticsearch.action.support.master.AcknowledgedResponse; @@ -40,18 +40,23 @@ import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.core.internal.io.Streams; +import org.elasticsearch.threadpool.ThreadPool; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.Iterator; import java.util.Map; +import static org.elasticsearch.common.unit.TimeValue.timeValueMillis; import static org.elasticsearch.action.admin.cluster.node.tasks.get.GetTaskAction.TASKS_ORIGIN; /** @@ -71,18 +76,24 @@ public class TaskResultsService { public static final int TASK_RESULT_MAPPING_VERSION = 2; + /** + * The backoff policy to use when saving a task result fails. The total wait + * time is 600000 milliseconds, ten minutes. + */ + static final BackoffPolicy STORE_BACKOFF_POLICY = + BackoffPolicy.exponentialBackoff(timeValueMillis(250), 14); + private final Client client; private final ClusterService clusterService; - private final TransportCreateIndexAction createIndexAction; + private final ThreadPool threadPool; @Inject - public TaskResultsService(Client client, ClusterService clusterService, - TransportCreateIndexAction createIndexAction) { + public TaskResultsService(Client client, ClusterService clusterService, ThreadPool threadPool) { this.client = new OriginSettingClient(client, TASKS_ORIGIN); this.clusterService = clusterService; - this.createIndexAction = createIndexAction; + this.threadPool = threadPool; } public void storeResult(TaskResult taskResult, ActionListener listener) { @@ -161,6 +172,10 @@ private void doStoreResult(TaskResult taskResult, ActionListener listener) } catch (IOException e) { throw new ElasticsearchException("Couldn't convert task result to XContent for [{}]", e, taskResult.getTask()); } + doStoreResult(STORE_BACKOFF_POLICY.iterator(), index, listener); + } + + private void doStoreResult(Iterator backoff, IndexRequestBuilder index, ActionListener listener) { index.execute(new ActionListener() { @Override public void onResponse(IndexResponse indexResponse) { @@ -169,7 +184,14 @@ public void onResponse(IndexResponse indexResponse) { @Override public void onFailure(Exception e) { - listener.onFailure(e); + if (false == (e instanceof EsRejectedExecutionException) + || false == backoff.hasNext()) { + listener.onFailure(e); + } else { + TimeValue wait = backoff.next(); + logger.warn(() -> new ParameterizedMessage("failed to store task result, retrying in [{}]", wait), e); + threadPool.schedule(wait, ThreadPool.Names.SAME, () -> doStoreResult(backoff, index, listener)); + } } }); } diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/node/tasks/TaskStorageRetryIT.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/node/tasks/TaskStorageRetryIT.java new file mode 100644 index 0000000000000..a2e645b457a8a --- /dev/null +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/node/tasks/TaskStorageRetryIT.java @@ -0,0 +1,127 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.action.admin.cluster.node.tasks; + +import org.elasticsearch.action.admin.cluster.node.tasks.get.GetTaskResponse; +import org.elasticsearch.action.support.PlainListenableActionFuture; +import org.elasticsearch.client.node.NodeClient; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.tasks.Task; +import org.elasticsearch.tasks.TaskId; +import org.elasticsearch.test.ESSingleNodeTestCase; +import org.elasticsearch.threadpool.ThreadPool; + +import java.util.Arrays; +import java.util.Collection; +import java.util.concurrent.CyclicBarrier; +import java.util.concurrent.TimeUnit; + +import static java.util.Collections.emptyMap; +import static java.util.Collections.singletonMap; + +/** + * Makes sure that tasks that attempt to store themselves on completion retry if + * they don't succeed at first. + */ +public class TaskStorageRetryIT extends ESSingleNodeTestCase { + @Override + protected Collection> getPlugins() { + return Arrays.asList(TestTaskPlugin.class); + } + + /** + * Lower the queue sizes to be small enough that both bulk and searches will time out and have to be retried. + */ + @Override + protected Settings nodeSettings() { + return Settings.builder() + .put(super.nodeSettings()) + .put("thread_pool.write.size", 2) + .put("thread_pool.write.queue_size", 0) + .build(); + } + + public void testRetry() throws Exception { + logger.info("block the write executor"); + CyclicBarrier barrier = new CyclicBarrier(2); + getInstanceFromNode(ThreadPool.class).executor(ThreadPool.Names.WRITE).execute(() -> { + try { + barrier.await(); + logger.info("blocking the write executor"); + barrier.await(); + logger.info("unblocked the write executor"); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + barrier.await(); + Task task; + PlainListenableActionFuture future = + PlainListenableActionFuture.newListenableFuture(); + try { + logger.info("start a task that will store its results"); + TestTaskPlugin.NodesRequest req = new TestTaskPlugin.NodesRequest("foo"); + req.setShouldStoreResult(true); + req.setShouldBlock(false); + task = nodeClient().executeLocally(TestTaskPlugin.TestTaskAction.INSTANCE, req, future); + + logger.info("verify that the task has started and is still running"); + assertBusy(() -> { + GetTaskResponse runningTask = client().admin().cluster() + .prepareGetTask(new TaskId(nodeClient().getLocalNodeId(), task.getId())) + .get(); + assertNotNull(runningTask.getTask()); + assertFalse(runningTask.getTask().isCompleted()); + assertEquals(emptyMap(), runningTask.getTask().getErrorAsMap()); + assertEquals(emptyMap(), runningTask.getTask().getResponseAsMap()); + assertFalse(future.isDone()); + }); + } finally { + logger.info("unblock the write executor"); + barrier.await(); + } + + logger.info("wait for the task to finish"); + future.get(10, TimeUnit.SECONDS); + + logger.info("check that it was written successfully"); + GetTaskResponse finishedTask = client().admin().cluster() + .prepareGetTask(new TaskId(nodeClient().getLocalNodeId(), task.getId())) + .get(); + assertTrue(finishedTask.getTask().isCompleted()); + assertEquals(emptyMap(), finishedTask.getTask().getErrorAsMap()); + assertEquals(singletonMap("failure_count", 0), + finishedTask.getTask().getResponseAsMap()); + } + + /** + * Get the {@linkplain NodeClient} local to the node being tested. + */ + private NodeClient nodeClient() { + /* + * Luckilly our test infrastructure already returns it, but we can't + * change the return type in the superclass because it is wrapped other + * places. + */ + return (NodeClient) client(); + } +} + diff --git a/server/src/test/java/org/elasticsearch/tasks/TaskResultsServiceTests.java b/server/src/test/java/org/elasticsearch/tasks/TaskResultsServiceTests.java new file mode 100644 index 0000000000000..7c896cb21595d --- /dev/null +++ b/server/src/test/java/org/elasticsearch/tasks/TaskResultsServiceTests.java @@ -0,0 +1,40 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.tasks; + +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.test.ESTestCase; + +import java.util.Iterator; + +/** + * Makes sure that tasks that attempt to store themselves on completion retry if + * they don't succeed at first. + */ +public class TaskResultsServiceTests extends ESTestCase { + public void testRetryTotalTime() { + Iterator times = TaskResultsService.STORE_BACKOFF_POLICY.iterator(); + long total = 0; + while (times.hasNext()) { + total += times.next().millis(); + } + assertEquals(600000L, total); + } +} \ No newline at end of file