Wait for cluster to recover before resolving index template #99797

Merged 8 commits on Sep 22, 2023

Changes from 2 commits
5 changes: 5 additions & 0 deletions docs/changelog/99797.yaml
@@ -0,0 +1,5 @@
pr: 99797
summary: Wait for cluster to recover before resolving index template
area: CRUD
type: bug
issues: []
@@ -7,15 +7,22 @@
*/
package org.elasticsearch.ingest.common;

import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.support.WriteRequest;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.Strings;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.ingest.IngestStats;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.script.MockScriptEngine;
import org.elasticsearch.script.MockScriptPlugin;
import org.elasticsearch.test.ESIntegTestCase;
@@ -24,8 +31,11 @@

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.function.Function;

@@ -262,4 +272,95 @@ public void testWithDedicatedIngestNode() throws Exception {
assertThat(source.get("y"), equalTo(0));
}

public void testDefaultPipelineWaitForClusterStateRecovered() throws Exception {
internalCluster().startNode();

final var pipeline = new BytesArray("""
{
"processors" : [
{
"set": {
"field": "value",
"value": 42
}
}
]
}""");
client().admin().cluster().preparePutPipeline("test_pipeline", pipeline, XContentType.JSON).get();
client().admin().indices().preparePutTemplate("pipeline_template").setPatterns(Collections.singletonList("*")).setSettings("""
{
"index" : {
"default_pipeline" : "test_pipeline"
}
}
""", XContentType.JSON).get();

internalCluster().fullRestart(new InternalTestCluster.RestartCallback() {
@Override
public Settings onNodeStopped(String nodeName) {
return Settings.builder().put(GatewayService.RECOVER_AFTER_DATA_NODES_SETTING.getKey(), "2").build();
}

@Override
public boolean validateClusterForming() {
return false;
Contributor:
Hmm I think it's not necessary to skip this validation, it's only checking that the master is elected. At least, the test should work whether we wait like that or not, so maybe return randomBoolean() to indicate we're not relying on it.

Member Author:
That's good to know. Thanks. Updated as suggested.

}
});
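Per the exchange above, a minimal sketch of what the updated callback looks like (assuming only the surrounding test class; randomBoolean() comes from the ESTestCase hierarchy):

        internalCluster().fullRestart(new InternalTestCluster.RestartCallback() {
            @Override
            public Settings onNodeStopped(String nodeName) {
                // require two data nodes so the restarted single-node cluster stays unrecovered
                return Settings.builder().put(GatewayService.RECOVER_AFTER_DATA_NODES_SETTING.getKey(), "2").build();
            }

            @Override
            public boolean validateClusterForming() {
                // the test must pass either way, so choose randomly to show we don't rely on this wait
                return randomBoolean();
            }
        });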

// this one should fail
assertThat(
expectThrows(
ClusterBlockException.class,
() -> client().prepareIndex("index")
.setId("fails")
.setSource("x", 1)
.setTimeout(TimeValue.timeValueMillis(100)) // 100ms, to fail quickly
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE)
.get()
).getMessage(),
equalTo("blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];")
);

final var latch = new CountDownLatch(1);
// but this one should pass since it has a longer timeout
client().prepareIndex("index")
.setId("passes1")
.setSource("x", 2)
.setTimeout(TimeValue.timeValueSeconds(60)) // wait for the second node to start (below)
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE)
.execute(new ActionListener<>() {
@Override
public void onResponse(IndexResponse indexResponse) {
assertThat(indexResponse.status(), equalTo(RestStatus.CREATED));
assertThat(indexResponse.getResult(), equalTo(DocWriteResponse.Result.CREATED));
latch.countDown();
}

@Override
public void onFailure(Exception e) {
fail("Should not have failed with exception: " + e.getMessage());
}
});

// so the cluster state can be recovered
internalCluster().startNode(Settings.builder().put(GatewayService.RECOVER_AFTER_DATA_NODES_SETTING.getKey(), "1"));
ensureYellow("index");
assertTrue(latch.await(5, TimeUnit.SECONDS));
Contributor:
5s might be a little short, suggest using safeAwait:

Suggested change
-        assertTrue(latch.await(5, TimeUnit.SECONDS));
+        safeAwait(latch);

However, personally I think it'd be even better to use a PlainActionFuture as the listener so that any exception is immediately thrown back on the main test thread rather than having to wait for the timeout on the latch.

Member Author:
I agree. Using PlainActionFuture is a better and more explicit choice. Updated.
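A minimal sketch of the PlainActionFuture variant (assuming org.elasticsearch.action.support.PlainActionFuture; this mirrors the suggestion, not necessarily the exact follow-up commit):

        // PlainActionFuture implements ActionListener, so it can stand in for the anonymous listener
        final PlainActionFuture<IndexResponse> future = new PlainActionFuture<>();
        client().prepareIndex("index")
            .setId("passes1")
            .setSource("x", 2)
            .setTimeout(TimeValue.timeValueSeconds(60))
            .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE)
            .execute(future);

        // ... start the second node so the cluster state recovers ...

        // any listener failure is rethrown here, on the main test thread
        final IndexResponse indexResponse = future.actionGet(TimeValue.timeValueSeconds(10));
        assertThat(indexResponse.status(), equalTo(RestStatus.CREATED));
        assertThat(indexResponse.getResult(), equalTo(DocWriteResponse.Result.CREATED));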


client().prepareIndex("index").setId("passes2").setSource("x", 3).setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get();
client().admin().indices().prepareRefresh("index").get();
Contributor:
This refresh shouldn't be necessary I think, we set ?refresh=true on all the preceding indexing.


// successfully indexed documents should have the value field set by the pipeline
Map<String, Object> source = client().prepareGet("index", "passes1").get().getSource();
assertThat(source.get("x"), equalTo(2));
assertThat(source.get("value"), equalTo(42));

source = client().prepareGet("index", "passes2").get().getSource();
assertThat(source.get("x"), equalTo(3));
assertThat(source.get("value"), equalTo(42));

// and make sure this failed doc didn't get through
source = client().prepareGet("index", "fails").get().getSource();
Contributor:
I would prefer all of these .get() calls to have a timeout (10s?) too.

Member Author:
👍 Updated all the places with a 10 second timeout.
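For illustration, one of these calls with the 10-second timeout applied might look like this (a sketch; ActionRequestBuilder.get(TimeValue) blocks with a bounded wait):

        Map<String, Object> source = client().prepareGet("index", "passes1")
            .get(TimeValue.timeValueSeconds(10)) // bounded wait instead of blocking indefinitely
            .getSource();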

assertNull(source);
}
}
@@ -191,9 +191,47 @@ public static <Response extends ReplicationResponse & WriteResponse> ActionListe

@Override
protected void doExecute(Task task, BulkRequest bulkRequest, ActionListener<BulkResponse> listener) {
final long startTime = relativeTime();
final ClusterState initialState = clusterService.state();
final ClusterBlockException blockException = initialState.blocks().globalBlockedException(ClusterBlockLevel.WRITE);
if (blockException != null) {
if (false == blockException.retryable()) {
listener.onFailure(blockException);
return;
}
logger.trace("cluster is blocked, waiting for it to recover", blockException);
final ClusterStateObserver clusterStateObserver = new ClusterStateObserver(
initialState,
clusterService,
bulkRequest.timeout(),
logger,
threadPool.getThreadContext()
);
clusterStateObserver.waitForNextChange(new ClusterStateObserver.Listener() {
@Override
public void onNewClusterState(ClusterState state) {
doExecuteOnWriteThreadPool(task, bulkRequest, startTime, listener);
}

@Override
public void onClusterServiceClose() {
listener.onFailure(new NodeClosedException(clusterService.localNode()));
}

@Override
public void onTimeout(TimeValue timeout) {
listener.onFailure(blockException);
}
}, newState -> false == newState.blocks().hasGlobalBlockWithLevel(ClusterBlockLevel.WRITE));
} else {
doExecuteOnWriteThreadPool(task, bulkRequest, startTime, listener);
}
}

private void doExecuteOnWriteThreadPool(Task task, BulkRequest bulkRequest, long startTime, ActionListener<BulkResponse> listener) {
/*
- * This is called on the Transport tread so we can check the indexing
- * memory pressure *quickly* but we don't want to keep the transport
+ * This is called on the Transport thread and sometimes on the cluster state applier thread,
+ * so we can check the indexing memory pressure *quickly* but we don't want to keep the transport
* thread busy. Then, as soon as we have the indexing pressure in we fork
* to one of the write thread pools. We do this because juggling the
* bulk request can get expensive for a few reasons:
@@ -209,19 +247,24 @@ protected void doExecute(Task task, BulkRequest bulkRequest, ActionListener<Bulk
final int indexingOps = bulkRequest.numberOfActions();
final long indexingBytes = bulkRequest.ramBytesUsed();
final boolean isOnlySystem = isOnlySystem(bulkRequest, clusterService.state().metadata().getIndicesLookup(), systemIndices);
- final String executorName = isOnlySystem ? Names.SYSTEM_WRITE : Names.WRITE;
final Releasable releasable = indexingPressure.markCoordinatingOperationStarted(indexingOps, indexingBytes, isOnlySystem);
Contributor:
Ah wait, sorry, we need to do this check first, on the transport worker, before we start waiting for cluster states. Otherwise we might just pile up far too much work in memory waiting for the cluster recovery.

But that is a little tricky because we need to compute isOnlySystem to call this, and that needs the cluster to be recovered. I suggest we conservatively assume isOnlySystem == false if the cluster is not yet recovered.

Member Author:
Sorry I don't quite follow. It seems to me that when the code reaches here, the cluster state is either recovered successfully or we had no need to wait for recovery at all. So there is no more waiting after this line?

Contributor:
Yes that's right, but it's the waiting before this line that's a problem. Before we run this line we have the bulk request in memory but we aren't tracking it in the indexing pressure subsystem. If we receive lots of indexing requests while the cluster is recovering then we will try to hold them all in memory, rejecting none of them, and will eventually just OOM.

Member Author:
Makes sense. Thanks a lot for the explanation! Updated in 812154a

Member Author:
Btw, I didn't change how isOnlySystem is computed since the method is written with the assumption that the index may not be available yet (see here). Also, if the metadata is not available, the ultimate default is to return false (when the SystemIndices also does not know the index).

Contributor:
I had some concerns about whether clusterState.metadata() could be null, or clusterState.metadata().getIndicesLookup(), but I did some checking and I think this is ok.

Member Author:
Thanks for checking! I didn't think that was possible.
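In sketch form, the ordering requested above (hypothetical: waitForClusterRecovery is an illustrative stand-in for the ClusterStateObserver logic earlier in this diff, and this is not the exact content of 812154a):

    @Override
    protected void doExecute(Task task, BulkRequest bulkRequest, ActionListener<BulkResponse> listener) {
        final long startTime = relativeTime();
        // 1. Track the request in the indexing-pressure subsystem immediately, on the
        //    transport worker, so requests queued behind an unrecovered cluster can be rejected.
        final boolean isOnlySystem = isOnlySystem(bulkRequest, clusterService.state().metadata().getIndicesLookup(), systemIndices);
        final Releasable releasable = indexingPressure.markCoordinatingOperationStarted(
            bulkRequest.numberOfActions(),
            bulkRequest.ramBytesUsed(),
            isOnlySystem
        );
        final ActionListener<BulkResponse> releasingListener = ActionListener.runBefore(listener, releasable::close);
        // 2. Only then wait, if needed, for the write block to clear before forking to the write pool.
        waitForClusterRecovery(bulkRequest, releasingListener,
            () -> doExecuteOnWriteThreadPool(task, bulkRequest, startTime, releasingListener));
    }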

final ActionListener<BulkResponse> releasingListener = ActionListener.runBefore(listener, releasable::close);
+ final String executorName = isOnlySystem ? Names.SYSTEM_WRITE : Names.WRITE;
threadPool.executor(Names.WRITE).execute(new ActionRunnable<>(releasingListener) {
@Override
protected void doRun() {
-                doInternalExecute(task, bulkRequest, executorName, releasingListener);
+                doInternalExecute(task, bulkRequest, executorName, startTime, releasingListener);
}
});
}

-    protected void doInternalExecute(Task task, BulkRequest bulkRequest, String executorName, ActionListener<BulkResponse> listener) {
-        final long startTime = relativeTime();
Contributor:
I think preserving the original start time is a good change but it seems unrelated to the rest of this PR, and deserves a test of its own too. Could you separate that out into a different PR?

Member Author:
Thanks for catching it. This is a bigger change than I initially thought because today the startTime is reset after ingestService completes the processing. My previous change would have made it include the ingestService processing time (but not the request forward time). I have reverted this change. A follow-up is better. Thanks!

+    protected void doInternalExecute(
+        Task task,
+        BulkRequest bulkRequest,
+        String executorName,
+        long startTime,
+        ActionListener<BulkResponse> listener
+    ) {
final AtomicArray<BulkItemResponse> responses = new AtomicArray<>(bulkRequest.requests.size());

boolean hasIndexRequestsWithPipelines = false;
@@ -256,7 +299,7 @@ protected void doInternalExecute(Task task, BulkRequest bulkRequest, String exec
assert arePipelinesResolved : bulkRequest;
}
if (clusterService.localNode().isIngestNode()) {
-            processBulkIndexIngestRequest(task, bulkRequest, executorName, l);
+            processBulkIndexIngestRequest(task, bulkRequest, executorName, startTime, l);
} else {
ingestForwarder.forwardIngestRequest(BulkAction.INSTANCE, bulkRequest, l);
}
@@ -759,6 +802,7 @@ private void processBulkIndexIngestRequest(
Task task,
BulkRequest original,
String executorName,
long startTime,
ActionListener<BulkResponse> listener
) {
final long ingestStartTimeInNanos = System.nanoTime();
@@ -788,7 +832,7 @@ private void processBulkIndexIngestRequest(
ActionRunnable<BulkResponse> runnable = new ActionRunnable<>(actionListener) {
@Override
protected void doRun() {
-                doInternalExecute(task, bulkRequest, executorName, actionListener);
+                doInternalExecute(task, bulkRequest, executorName, startTime, actionListener);
}

@Override