Skip to content

Commit

Permalink
Tasks: Retry if task can't be written (#35054)
Browse files Browse the repository at this point in the history
Adds about a minute worth of backoffs and retries to saving task
results so it is *much* more likely that a busy cluster won't lose task
results. This isn't an ideal solution to losing task results, but it is
an incremental improvement. If all of the retries fail when still log
the task result, but that is far from ideal.

Closes #33764
  • Loading branch information
nik9000 committed Dec 3, 2018
1 parent c6cf11e commit 234e70d
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
import org.elasticsearch.action.admin.indices.create.TransportCreateIndexAction;
import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
Expand All @@ -40,18 +40,23 @@
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.core.internal.io.Streams;
import org.elasticsearch.threadpool.ThreadPool;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Map;

import static org.elasticsearch.common.unit.TimeValue.timeValueMillis;
import static org.elasticsearch.action.admin.cluster.node.tasks.get.GetTaskAction.TASKS_ORIGIN;

/**
Expand All @@ -71,18 +76,24 @@ public class TaskResultsService {

public static final int TASK_RESULT_MAPPING_VERSION = 2;

/**
* The backoff policy to use when saving a task result fails. The total wait
* time is 600000 milliseconds, ten minutes.
*/
static final BackoffPolicy STORE_BACKOFF_POLICY =
BackoffPolicy.exponentialBackoff(timeValueMillis(250), 14);

private final Client client;

private final ClusterService clusterService;

private final TransportCreateIndexAction createIndexAction;
private final ThreadPool threadPool;

@Inject
public TaskResultsService(Client client, ClusterService clusterService,
TransportCreateIndexAction createIndexAction) {
public TaskResultsService(Client client, ClusterService clusterService, ThreadPool threadPool) {
this.client = new OriginSettingClient(client, TASKS_ORIGIN);
this.clusterService = clusterService;
this.createIndexAction = createIndexAction;
this.threadPool = threadPool;
}

public void storeResult(TaskResult taskResult, ActionListener<Void> listener) {
Expand Down Expand Up @@ -161,6 +172,10 @@ private void doStoreResult(TaskResult taskResult, ActionListener<Void> listener)
} catch (IOException e) {
throw new ElasticsearchException("Couldn't convert task result to XContent for [{}]", e, taskResult.getTask());
}
doStoreResult(STORE_BACKOFF_POLICY.iterator(), index, listener);
}

private void doStoreResult(Iterator<TimeValue> backoff, IndexRequestBuilder index, ActionListener<Void> listener) {
index.execute(new ActionListener<IndexResponse>() {
@Override
public void onResponse(IndexResponse indexResponse) {
Expand All @@ -169,7 +184,14 @@ public void onResponse(IndexResponse indexResponse) {

@Override
public void onFailure(Exception e) {
listener.onFailure(e);
if (false == (e instanceof EsRejectedExecutionException)
|| false == backoff.hasNext()) {
listener.onFailure(e);
} else {
TimeValue wait = backoff.next();
logger.warn(() -> new ParameterizedMessage("failed to store task result, retrying in [{}]", wait), e);
threadPool.schedule(wait, ThreadPool.Names.SAME, () -> doStoreResult(backoff, index, listener));
}
}
});
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.action.admin.cluster.node.tasks;

import org.elasticsearch.action.admin.cluster.node.tasks.get.GetTaskResponse;
import org.elasticsearch.action.support.PlainListenableActionFuture;
import org.elasticsearch.client.node.NodeClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.tasks.TaskId;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.threadpool.ThreadPool;

import java.util.Arrays;
import java.util.Collection;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.TimeUnit;

import static java.util.Collections.emptyMap;
import static java.util.Collections.singletonMap;

/**
* Makes sure that tasks that attempt to store themselves on completion retry if
* they don't succeed at first.
*/
public class TaskStorageRetryIT extends ESSingleNodeTestCase {
@Override
protected Collection<Class<? extends Plugin>> getPlugins() {
return Arrays.asList(TestTaskPlugin.class);
}

/**
* Lower the queue sizes to be small enough that both bulk and searches will time out and have to be retried.
*/
@Override
protected Settings nodeSettings() {
return Settings.builder()
.put(super.nodeSettings())
.put("thread_pool.write.size", 2)
.put("thread_pool.write.queue_size", 0)
.build();
}

public void testRetry() throws Exception {
logger.info("block the write executor");
CyclicBarrier barrier = new CyclicBarrier(2);
getInstanceFromNode(ThreadPool.class).executor(ThreadPool.Names.WRITE).execute(() -> {
try {
barrier.await();
logger.info("blocking the write executor");
barrier.await();
logger.info("unblocked the write executor");
} catch (Exception e) {
throw new RuntimeException(e);
}
});
barrier.await();
Task task;
PlainListenableActionFuture<TestTaskPlugin.NodesResponse> future =
PlainListenableActionFuture.newListenableFuture();
try {
logger.info("start a task that will store its results");
TestTaskPlugin.NodesRequest req = new TestTaskPlugin.NodesRequest("foo");
req.setShouldStoreResult(true);
req.setShouldBlock(false);
task = nodeClient().executeLocally(TestTaskPlugin.TestTaskAction.INSTANCE, req, future);

logger.info("verify that the task has started and is still running");
assertBusy(() -> {
GetTaskResponse runningTask = client().admin().cluster()
.prepareGetTask(new TaskId(nodeClient().getLocalNodeId(), task.getId()))
.get();
assertNotNull(runningTask.getTask());
assertFalse(runningTask.getTask().isCompleted());
assertEquals(emptyMap(), runningTask.getTask().getErrorAsMap());
assertEquals(emptyMap(), runningTask.getTask().getResponseAsMap());
assertFalse(future.isDone());
});
} finally {
logger.info("unblock the write executor");
barrier.await();
}

logger.info("wait for the task to finish");
future.get(10, TimeUnit.SECONDS);

logger.info("check that it was written successfully");
GetTaskResponse finishedTask = client().admin().cluster()
.prepareGetTask(new TaskId(nodeClient().getLocalNodeId(), task.getId()))
.get();
assertTrue(finishedTask.getTask().isCompleted());
assertEquals(emptyMap(), finishedTask.getTask().getErrorAsMap());
assertEquals(singletonMap("failure_count", 0),
finishedTask.getTask().getResponseAsMap());
}

/**
* Get the {@linkplain NodeClient} local to the node being tested.
*/
private NodeClient nodeClient() {
/*
* Luckilly our test infrastructure already returns it, but we can't
* change the return type in the superclass because it is wrapped other
* places.
*/
return (NodeClient) client();
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.tasks;

import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.ESTestCase;

import java.util.Iterator;

/**
* Makes sure that tasks that attempt to store themselves on completion retry if
* they don't succeed at first.
*/
public class TaskResultsServiceTests extends ESTestCase {
public void testRetryTotalTime() {
Iterator<TimeValue> times = TaskResultsService.STORE_BACKOFF_POLICY.iterator();
long total = 0;
while (times.hasNext()) {
total += times.next().millis();
}
assertEquals(600000L, total);
}
}

0 comments on commit 234e70d

Please sign in to comment.