From da851244a273d6fbfc42eb115f2dcfca1c3cc728 Mon Sep 17 00:00:00 2001 From: David Z <38449481+dzane17@users.noreply.github.com> Date: Mon, 23 Oct 2023 11:01:47 -0700 Subject: [PATCH 01/33] Update per request latency check to V_2_12_0 (#10865) Signed-off-by: David Zane --- .../main/java/org/opensearch/action/search/SearchRequest.java | 4 ++-- .../java/org/opensearch/action/search/SearchResponse.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/action/search/SearchRequest.java b/server/src/main/java/org/opensearch/action/search/SearchRequest.java index 9e50213eab5f9..fb026dae630b7 100644 --- a/server/src/main/java/org/opensearch/action/search/SearchRequest.java +++ b/server/src/main/java/org/opensearch/action/search/SearchRequest.java @@ -256,7 +256,7 @@ public SearchRequest(StreamInput in) throws IOException { if (in.getVersion().onOrAfter(Version.V_2_7_0)) { pipeline = in.readOptionalString(); } - if (in.getVersion().onOrAfter(Version.V_3_0_0)) { + if (in.getVersion().onOrAfter(Version.V_2_12_0)) { phaseTook = in.readOptionalBoolean(); } } @@ -290,7 +290,7 @@ public void writeTo(StreamOutput out) throws IOException { if (out.getVersion().onOrAfter(Version.V_2_7_0)) { out.writeOptionalString(pipeline); } - if (out.getVersion().onOrAfter(Version.V_3_0_0)) { + if (out.getVersion().onOrAfter(Version.V_2_12_0)) { out.writeOptionalBoolean(phaseTook); } } diff --git a/server/src/main/java/org/opensearch/action/search/SearchResponse.java b/server/src/main/java/org/opensearch/action/search/SearchResponse.java index 91f0dc0737637..96d07982d03db 100644 --- a/server/src/main/java/org/opensearch/action/search/SearchResponse.java +++ b/server/src/main/java/org/opensearch/action/search/SearchResponse.java @@ -116,7 +116,7 @@ public SearchResponse(StreamInput in) throws IOException { clusters = new Clusters(in); scrollId = in.readOptionalString(); tookInMillis = in.readVLong(); - if (in.getVersion().onOrAfter(Version.V_3_0_0)) { + if (in.getVersion().onOrAfter(Version.V_2_12_0)) { phaseTook = in.readOptionalWriteable(PhaseTook::new); } else { phaseTook = null; @@ -557,7 +557,7 @@ public void writeTo(StreamOutput out) throws IOException { clusters.writeTo(out); out.writeOptionalString(scrollId); out.writeVLong(tookInMillis); - if (out.getVersion().onOrAfter(Version.V_3_0_0)) { + if (out.getVersion().onOrAfter(Version.V_2_12_0)) { out.writeOptionalWriteable(phaseTook); } out.writeVInt(skippedShards); From 8b2173910f754a48773b3283e1a511cbc1a9db78 Mon Sep 17 00:00:00 2001 From: Poojita Raj Date: Mon, 23 Oct 2023 16:08:04 -0700 Subject: [PATCH 02/33] =?UTF-8?q?Add=20cluster=20setting=20cluster.restric?= =?UTF-8?q?t.index.replication=5Ftype=20t=E2=80=A6=20(#10866)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add cluster setting CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING to restrict setting of index setting replication type Signed-off-by: Poojita Raj * Add Changelog entry Signed-off-by: Poojita Raj * refactoring Signed-off-by: Poojita Raj --------- Signed-off-by: Poojita Raj --- CHANGELOG.md | 3 ++- .../SegmentReplicationClusterSettingIT.java | 27 +++++++++++++++++++ .../metadata/MetadataCreateIndexService.java | 19 +++++++++++++ .../common/settings/ClusterSettings.java | 3 ++- .../opensearch/indices/IndicesService.java | 11 ++++++++ .../MetadataCreateIndexServiceTests.java | 16 +++++++++-- 6 files changed, 75 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
8c4563a216974..0b0d9720ad208 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add search query categorizer ([#10255](https://github.com/opensearch-project/OpenSearch/pull/10255)) - Introduce ConcurrentQueryProfiler to profile query using concurrent segment search path and support concurrency during rewrite and create weight ([10352](https://github.com/opensearch-project/OpenSearch/pull/10352)) - [Remote cluster state] Make index and global metadata upload timeout dynamic cluster settings ([#10814](https://github.com/opensearch-project/OpenSearch/pull/10814)) +- Added cluster setting cluster.restrict.index.replication_type to restrict setting of index setting replication type ([#10866](https://github.com/opensearch-project/OpenSearch/pull/10866)) ### Dependencies - Bump `com.google.api.grpc:proto-google-common-protos` from 2.10.0 to 2.25.1 ([#10208](https://github.com/opensearch-project/OpenSearch/pull/10208), [#10298](https://github.com/opensearch-project/OpenSearch/pull/10298)) @@ -131,4 +132,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Security [Unreleased 3.0]: https://github.com/opensearch-project/OpenSearch/compare/2.x...HEAD -[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.12...2.x \ No newline at end of file +[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.12...2.x diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationClusterSettingIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationClusterSettingIT.java index a82fd8d845709..186a5ce39f131 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationClusterSettingIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationClusterSettingIT.java @@ -19,6 +19,7 @@ import org.opensearch.test.OpenSearchIntegTestCase; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_REPLICATION_TYPE; +import static org.opensearch.indices.IndicesService.CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING; import static org.opensearch.indices.IndicesService.CLUSTER_SETTING_REPLICATION_TYPE; @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0) @@ -123,4 +124,30 @@ public void testIndexReplicationSettingOverridesDocRepClusterSetting() throws Ex assertEquals(indicesService.indexService(anotherIndex).getIndexSettings().isSegRepEnabled(), false); } + public void testIndexReplicationTypeWhenRestrictSettingTrue() { + testRestrictIndexReplicationTypeSetting(true, randomFrom(ReplicationType.values())); + } + + public void testIndexReplicationTypeWhenRestrictSettingFalse() { + testRestrictIndexReplicationTypeSetting(false, randomFrom(ReplicationType.values())); + } + + private void testRestrictIndexReplicationTypeSetting(boolean setRestrict, ReplicationType replicationType) { + String expectedExceptionMsg = + "Validation Failed: 1: index setting [index.replication.type] is not allowed to be set as [cluster.restrict.index.replication_type=true];"; + String clusterManagerName = internalCluster().startNode( + Settings.builder().put(CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING.getKey(), setRestrict).build() + ); + internalCluster().startDataOnlyNodes(1); + + // Test create index fails + Settings indexSettings = 
Settings.builder().put(indexSettings()).put(SETTING_REPLICATION_TYPE, replicationType).build(); + if (setRestrict) { + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> createIndex(INDEX_NAME, indexSettings)); + assertEquals(expectedExceptionMsg, exception.getMessage()); + } else { + createIndex(INDEX_NAME, indexSettings); + } + } + } diff --git a/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java b/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java index 8d76a39712ee3..78a22fe11f072 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java @@ -1252,6 +1252,7 @@ List getIndexSettingsValidationErrors( if (forbidPrivateIndexSettings) { validationErrors.addAll(validatePrivateSettingsNotExplicitlySet(settings, indexScopedSettings)); } + validateIndexReplicationTypeSettings(settings, clusterService.getClusterSettings()).ifPresent(validationErrors::add); if (indexName.isEmpty() || indexName.get().charAt(0) != '.') { // Apply aware replica balance validation only to non system indices int replicaCount = settings.getAsInt( @@ -1306,6 +1307,24 @@ private static List validateIndexCustomPath(Settings settings, @Nullable return validationErrors; } + /** + * Validates {@code index.replication.type} is not set if {@code cluster.restrict.index.replication_type} is set to true. + * + * @param requestSettings settings passed in during index create request + * @param clusterSettings cluster setting + */ + private static Optional validateIndexReplicationTypeSettings(Settings requestSettings, ClusterSettings clusterSettings) { + if (requestSettings.hasValue(SETTING_REPLICATION_TYPE) + && clusterSettings.get(IndicesService.CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING)) { + return Optional.of( + "index setting [index.replication.type] is not allowed to be set as [" + + IndicesService.CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING.getKey() + + "=true]" + ); + } + return Optional.empty(); + } + /** * Validates the settings and mappings for shrinking an index. 
* diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index a0fca4f0a2ff0..c2c6effc3336f 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -690,7 +690,8 @@ public void apply(Settings value, Settings current, Settings previous) { AdmissionControlSettings.ADMISSION_CONTROL_TRANSPORT_LAYER_MODE, CPUBasedAdmissionControllerSettings.CPU_BASED_ADMISSION_CONTROLLER_TRANSPORT_LAYER_MODE, CPUBasedAdmissionControllerSettings.INDEXING_CPU_USAGE_LIMIT, - CPUBasedAdmissionControllerSettings.SEARCH_CPU_USAGE_LIMIT + CPUBasedAdmissionControllerSettings.SEARCH_CPU_USAGE_LIMIT, + IndicesService.CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING ) ) ); diff --git a/server/src/main/java/org/opensearch/indices/IndicesService.java b/server/src/main/java/org/opensearch/indices/IndicesService.java index 50c551c2be29b..36abc77893d81 100644 --- a/server/src/main/java/org/opensearch/indices/IndicesService.java +++ b/server/src/main/java/org/opensearch/indices/IndicesService.java @@ -299,6 +299,17 @@ public class IndicesService extends AbstractLifecycleComponent Property.Final ); + /** + * This setting is used to restrict creation of index where the 'index.replication.type' index setting is set. + * If disabled, the replication type can be specified. + */ + public static final Setting CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING = Setting.boolSetting( + "cluster.restrict.index.replication_type", + false, + Property.NodeScope, + Property.Final + ); + /** * The node's settings. */ diff --git a/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java b/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java index e40826915c848..cace66d8c6d9e 100644 --- a/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java +++ b/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java @@ -139,6 +139,7 @@ import static org.opensearch.indices.IndicesService.CLUSTER_MINIMUM_INDEX_REFRESH_INTERVAL_SETTING; import static org.opensearch.indices.IndicesService.CLUSTER_REMOTE_INDEX_RESTRICT_ASYNC_DURABILITY_SETTING; import static org.opensearch.indices.IndicesService.CLUSTER_REPLICATION_TYPE_SETTING; +import static org.opensearch.indices.IndicesService.CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING; import static org.opensearch.indices.ShardLimitValidatorTests.createTestShardLimitService; import static org.opensearch.node.Node.NODE_ATTRIBUTES; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_SEGMENT_REPOSITORY_NAME_ATTRIBUTE_KEY; @@ -1177,6 +1178,8 @@ public void testvalidateIndexSettings() { .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.getKey() + "zone.values", "a, b") .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.getKey() + "rack.values", "c, d, e") .put(AwarenessReplicaBalance.CLUSTER_ROUTING_ALLOCATION_AWARENESS_BALANCE_SETTING.getKey(), true) + .put(CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING.getKey(), true) + .put(SETTING_REPLICATION_TYPE, randomFrom(ReplicationType.values())) .build(); ClusterSettings clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); when(clusterService.getSettings()).thenReturn(settings); @@ 
-1200,8 +1203,12 @@ public void testvalidateIndexSettings() { ); List validationErrors = checkerService.getIndexSettingsValidationErrors(settings, false, Optional.empty()); - assertThat(validationErrors.size(), is(1)); - assertThat(validationErrors.get(0), is("expected total copies needs to be a multiple of total awareness attributes [3]")); + assertThat(validationErrors.size(), is(2)); + assertThat( + validationErrors.get(0), + is("index setting [index.replication.type] is not allowed to be set as [cluster.restrict.index.replication_type=true]") + ); + assertThat(validationErrors.get(1), is("expected total copies needs to be a multiple of total awareness attributes [3]")); settings = Settings.builder() .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(), "zone, rack") @@ -1209,8 +1216,13 @@ public void testvalidateIndexSettings() { .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.getKey() + "rack.values", "c, d, e") .put(AwarenessReplicaBalance.CLUSTER_ROUTING_ALLOCATION_AWARENESS_BALANCE_SETTING.getKey(), true) .put(SETTING_NUMBER_OF_REPLICAS, 2) + .put(CLUSTER_RESTRICT_INDEX_REPLICATION_TYPE_SETTING.getKey(), false) + .put(SETTING_REPLICATION_TYPE, randomFrom(ReplicationType.values())) .build(); + clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + when(clusterService.getClusterSettings()).thenReturn(clusterSettings); + validationErrors = checkerService.getIndexSettingsValidationErrors(settings, false, Optional.empty()); assertThat(validationErrors.size(), is(0)); From 5a288ed3ecf06a048eba6028c445aa4e642d39c5 Mon Sep 17 00:00:00 2001 From: Dhwanil Patel Date: Tue, 24 Oct 2023 09:02:48 +0530 Subject: [PATCH 03/33] Changing version check to 2.12 for remote cluster state (#10844) Signed-off-by: Dhwanil Patel --- .../opensearch/gateway/remote/ClusterMetadataManifest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/gateway/remote/ClusterMetadataManifest.java b/server/src/main/java/org/opensearch/gateway/remote/ClusterMetadataManifest.java index 97b37d9532f85..4725f40076ce2 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/ClusterMetadataManifest.java +++ b/server/src/main/java/org/opensearch/gateway/remote/ClusterMetadataManifest.java @@ -262,7 +262,7 @@ public ClusterMetadataManifest(StreamInput in) throws IOException { this.indices = Collections.unmodifiableList(in.readList(UploadedIndexMetadata::new)); this.previousClusterUUID = in.readString(); this.clusterUUIDCommitted = in.readBoolean(); - if (in.getVersion().onOrAfter(Version.V_3_0_0)) { + if (in.getVersion().onOrAfter(Version.V_2_12_0)) { this.codecVersion = in.readInt(); this.globalMetadataFileName = in.readString(); } else { @@ -316,7 +316,7 @@ public void writeTo(StreamOutput out) throws IOException { out.writeCollection(indices); out.writeString(previousClusterUUID); out.writeBoolean(clusterUUIDCommitted); - if (out.getVersion().onOrAfter(Version.V_3_0_0)) { + if (out.getVersion().onOrAfter(Version.V_2_12_0)) { out.writeInt(codecVersion); out.writeString(globalMetadataFileName); } From a2b5f0e3a9515f101653afc88dd98e48015f6346 Mon Sep 17 00:00:00 2001 From: Shivansh Arora <31575408+shiv0408@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:58:32 +0530 Subject: [PATCH 04/33] Created new urgent priority threadpool for remote cluster state uploads (#10685) * Created new Urgent WritePriority and a new threadpools and S3Client for this 
priority which is being used to upload cluster state Signed-off-by: Shivansh Arora --- .../s3/S3BlobStoreRepositoryTests.java | 2 +- .../s3/AmazonAsyncS3Reference.java | 1 + .../s3/AmazonAsyncS3WithCredentials.java | 10 ++- .../repositories/s3/S3AsyncService.java | 17 +++- .../repositories/s3/S3BlobContainer.java | 11 ++- .../repositories/s3/S3BlobStore.java | 5 +- .../repositories/s3/S3Repository.java | 6 ++ .../repositories/s3/S3RepositoryPlugin.java | 22 ++++- .../s3/async/AsyncPartsHandler.java | 15 +++- .../s3/async/AsyncTransferManager.java | 21 ++++- .../s3/RepositoryCredentialsTests.java | 2 +- .../repositories/s3/S3AsyncServiceTests.java | 8 +- .../s3/S3BlobContainerMockClientTests.java | 5 +- .../s3/S3BlobContainerRetriesTests.java | 2 + .../s3/S3BlobStoreContainerTests.java | 8 +- .../repositories/s3/S3RepositoryTests.java | 1 + .../s3/async/AsyncTransferManagerTests.java | 1 + .../blobstore/stream/write/WritePriority.java | 3 +- .../remote/RemoteClusterStateService.java | 4 +- .../blobstore/ChecksumBlobStoreFormat.java | 45 ++++++++-- .../RemoteClusterStateServiceTests.java | 2 +- .../snapshots/BlobStoreFormatTests.java | 87 +++++++++++++++---- 22 files changed, 224 insertions(+), 54 deletions(-) diff --git a/plugins/repository-s3/src/internalClusterTest/java/org/opensearch/repositories/s3/S3BlobStoreRepositoryTests.java b/plugins/repository-s3/src/internalClusterTest/java/org/opensearch/repositories/s3/S3BlobStoreRepositoryTests.java index 4df30bfd2169e..da2c6e8c1b0ee 100644 --- a/plugins/repository-s3/src/internalClusterTest/java/org/opensearch/repositories/s3/S3BlobStoreRepositoryTests.java +++ b/plugins/repository-s3/src/internalClusterTest/java/org/opensearch/repositories/s3/S3BlobStoreRepositoryTests.java @@ -249,7 +249,7 @@ protected S3Repository createRepository( ClusterService clusterService, RecoverySettings recoverySettings ) { - return new S3Repository(metadata, registry, service, clusterService, recoverySettings, null, null, null, null, false) { + return new S3Repository(metadata, registry, service, clusterService, recoverySettings, null, null, null, null, null, false) { @Override public BlobStore blobStore() { diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3Reference.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3Reference.java index 0b5fcb6df280e..45170ea1ad209 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3Reference.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3Reference.java @@ -29,6 +29,7 @@ public class AmazonAsyncS3Reference extends RefCountedReleasable { client.client().close(); client.priorityClient().close(); + client.urgentClient().close(); AwsCredentialsProvider credentials = client.credentials(); if (credentials instanceof Closeable) { try { diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3WithCredentials.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3WithCredentials.java index fa2db83729d25..f8a313b55d945 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3WithCredentials.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/AmazonAsyncS3WithCredentials.java @@ -19,16 +19,19 @@ final class AmazonAsyncS3WithCredentials { private final S3AsyncClient client; private final S3AsyncClient priorityClient; + private final S3AsyncClient 
urgentClient; private final AwsCredentialsProvider credentials; private AmazonAsyncS3WithCredentials( final S3AsyncClient client, final S3AsyncClient priorityClient, + final S3AsyncClient urgentClient, @Nullable final AwsCredentialsProvider credentials ) { this.client = client; this.credentials = credentials; this.priorityClient = priorityClient; + this.urgentClient = urgentClient; } S3AsyncClient client() { @@ -39,6 +42,10 @@ S3AsyncClient priorityClient() { return priorityClient; } + S3AsyncClient urgentClient() { + return urgentClient; + } + AwsCredentialsProvider credentials() { return credentials; } @@ -46,8 +53,9 @@ AwsCredentialsProvider credentials() { static AmazonAsyncS3WithCredentials create( final S3AsyncClient client, final S3AsyncClient priorityClient, + final S3AsyncClient urgentClient, @Nullable final AwsCredentialsProvider credentials ) { - return new AmazonAsyncS3WithCredentials(client, priorityClient, credentials); + return new AmazonAsyncS3WithCredentials(client, priorityClient, urgentClient, credentials); } } diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3AsyncService.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3AsyncService.java index 08215ebdd45e0..262304029a0d3 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3AsyncService.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3AsyncService.java @@ -103,6 +103,7 @@ public synchronized void refreshAndClearCache(Map clie */ public AmazonAsyncS3Reference client( RepositoryMetadata repositoryMetadata, + AsyncExecutorContainer urgentExecutorBuilder, AsyncExecutorContainer priorityExecutorBuilder, AsyncExecutorContainer normalExecutorBuilder ) { @@ -119,7 +120,7 @@ public AmazonAsyncS3Reference client( return existing; } final AmazonAsyncS3Reference clientReference = new AmazonAsyncS3Reference( - buildClient(clientSettings, priorityExecutorBuilder, normalExecutorBuilder) + buildClient(clientSettings, urgentExecutorBuilder, priorityExecutorBuilder, normalExecutorBuilder) ); clientReference.incRef(); clientsCache = MapBuilder.newMapBuilder(clientsCache).put(clientSettings, clientReference).immutableMap(); @@ -165,6 +166,7 @@ S3ClientSettings settings(RepositoryMetadata repositoryMetadata) { // proxy for testing synchronized AmazonAsyncS3WithCredentials buildClient( final S3ClientSettings clientSettings, + AsyncExecutorContainer urgentExecutorBuilder, AsyncExecutorContainer priorityExecutorBuilder, AsyncExecutorContainer normalExecutorBuilder ) { @@ -195,6 +197,17 @@ synchronized AmazonAsyncS3WithCredentials buildClient( builder.forcePathStyle(true); } + builder.httpClient(buildHttpClient(clientSettings, urgentExecutorBuilder.getAsyncTransferEventLoopGroup())); + builder.asyncConfiguration( + ClientAsyncConfiguration.builder() + .advancedOption( + SdkAdvancedAsyncClientOption.FUTURE_COMPLETION_EXECUTOR, + urgentExecutorBuilder.getFutureCompletionExecutor() + ) + .build() + ); + final S3AsyncClient urgentClient = SocketAccess.doPrivileged(builder::build); + builder.httpClient(buildHttpClient(clientSettings, priorityExecutorBuilder.getAsyncTransferEventLoopGroup())); builder.asyncConfiguration( ClientAsyncConfiguration.builder() @@ -217,7 +230,7 @@ synchronized AmazonAsyncS3WithCredentials buildClient( ); final S3AsyncClient client = SocketAccess.doPrivileged(builder::build); - return AmazonAsyncS3WithCredentials.create(client, priorityClient, credentials); + return 
AmazonAsyncS3WithCredentials.create(client, priorityClient, urgentClient, credentials); } static ClientOverrideConfiguration buildOverrideConfiguration(final S3ClientSettings clientSettings) { diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobContainer.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobContainer.java index 24aee99242957..c1180aab0e0c7 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobContainer.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobContainer.java @@ -195,9 +195,14 @@ public void asyncBlobUpload(WriteContext writeContext, ActionListener comp StreamContext streamContext = SocketAccess.doPrivileged(() -> writeContext.getStreamProvider(partSize)); try (AmazonAsyncS3Reference amazonS3Reference = SocketAccess.doPrivileged(blobStore::asyncClientReference)) { - S3AsyncClient s3AsyncClient = writeContext.getWritePriority() == WritePriority.HIGH - ? amazonS3Reference.get().priorityClient() - : amazonS3Reference.get().client(); + S3AsyncClient s3AsyncClient; + if (writeContext.getWritePriority() == WritePriority.URGENT) { + s3AsyncClient = amazonS3Reference.get().urgentClient(); + } else if (writeContext.getWritePriority() == WritePriority.HIGH) { + s3AsyncClient = amazonS3Reference.get().priorityClient(); + } else { + s3AsyncClient = amazonS3Reference.get().client(); + } CompletableFuture completableFuture = blobStore.getAsyncTransferManager() .uploadObject(s3AsyncClient, uploadRequest, streamContext, blobStore.getStatsMetricPublisher()); completableFuture.whenComplete((response, throwable) -> { diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java index f568d871dd31a..e8e043357e126 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java @@ -84,6 +84,7 @@ class S3BlobStore implements BlobStore { private final StatsMetricPublisher statsMetricPublisher = new StatsMetricPublisher(); private final AsyncTransferManager asyncTransferManager; + private final AsyncExecutorContainer urgentExecutorBuilder; private final AsyncExecutorContainer priorityExecutorBuilder; private final AsyncExecutorContainer normalExecutorBuilder; private final boolean multipartUploadEnabled; @@ -100,6 +101,7 @@ class S3BlobStore implements BlobStore { int bulkDeletesSize, RepositoryMetadata repositoryMetadata, AsyncTransferManager asyncTransferManager, + AsyncExecutorContainer urgentExecutorBuilder, AsyncExecutorContainer priorityExecutorBuilder, AsyncExecutorContainer normalExecutorBuilder ) { @@ -116,6 +118,7 @@ class S3BlobStore implements BlobStore { this.asyncTransferManager = asyncTransferManager; this.normalExecutorBuilder = normalExecutorBuilder; this.priorityExecutorBuilder = priorityExecutorBuilder; + this.urgentExecutorBuilder = urgentExecutorBuilder; } @Override @@ -139,7 +142,7 @@ public AmazonS3Reference clientReference() { } public AmazonAsyncS3Reference asyncClientReference() { - return s3AsyncService.client(repositoryMetadata, priorityExecutorBuilder, normalExecutorBuilder); + return s3AsyncService.client(repositoryMetadata, urgentExecutorBuilder, priorityExecutorBuilder, normalExecutorBuilder); } int getMaxRetries() { diff --git 
a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java index aaf5b79891cdc..728a99b1220a6 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java @@ -234,6 +234,7 @@ class S3Repository extends MeteredBlobStoreRepository { private final AsyncTransferManager asyncUploadUtils; private final S3AsyncService s3AsyncService; private final boolean multipartUploadEnabled; + private final AsyncExecutorContainer urgentExecutorBuilder; private final AsyncExecutorContainer priorityExecutorBuilder; private final AsyncExecutorContainer normalExecutorBuilder; private final Path pluginConfigPath; @@ -248,6 +249,7 @@ class S3Repository extends MeteredBlobStoreRepository { final ClusterService clusterService, final RecoverySettings recoverySettings, final AsyncTransferManager asyncUploadUtils, + final AsyncExecutorContainer urgentExecutorBuilder, final AsyncExecutorContainer priorityExecutorBuilder, final AsyncExecutorContainer normalExecutorBuilder, final S3AsyncService s3AsyncService, @@ -260,6 +262,7 @@ class S3Repository extends MeteredBlobStoreRepository { clusterService, recoverySettings, asyncUploadUtils, + urgentExecutorBuilder, priorityExecutorBuilder, normalExecutorBuilder, s3AsyncService, @@ -278,6 +281,7 @@ class S3Repository extends MeteredBlobStoreRepository { final ClusterService clusterService, final RecoverySettings recoverySettings, final AsyncTransferManager asyncUploadUtils, + final AsyncExecutorContainer urgentExecutorBuilder, final AsyncExecutorContainer priorityExecutorBuilder, final AsyncExecutorContainer normalExecutorBuilder, final S3AsyncService s3AsyncService, @@ -290,6 +294,7 @@ class S3Repository extends MeteredBlobStoreRepository { this.multipartUploadEnabled = multipartUploadEnabled; this.pluginConfigPath = pluginConfigPath; this.asyncUploadUtils = asyncUploadUtils; + this.urgentExecutorBuilder = urgentExecutorBuilder; this.priorityExecutorBuilder = priorityExecutorBuilder; this.normalExecutorBuilder = normalExecutorBuilder; @@ -352,6 +357,7 @@ protected S3BlobStore createBlobStore() { bulkDeletesSize, metadata, asyncUploadUtils, + urgentExecutorBuilder, priorityExecutorBuilder, normalExecutorBuilder ); diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3RepositoryPlugin.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3RepositoryPlugin.java index c6450e49d08e2..9ed232464d080 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3RepositoryPlugin.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3RepositoryPlugin.java @@ -75,6 +75,9 @@ * A plugin to add a repository type that writes to and from the AWS S3. 
*/ public class S3RepositoryPlugin extends Plugin implements RepositoryPlugin, ReloadablePlugin { + + private static final String URGENT_FUTURE_COMPLETION = "urgent_future_completion"; + private static final String URGENT_STREAM_READER = "urgent_stream_reader"; private static final String PRIORITY_FUTURE_COMPLETION = "priority_future_completion"; private static final String PRIORITY_STREAM_READER = "priority_stream_reader"; private static final String FUTURE_COMPLETION = "future_completion"; @@ -85,6 +88,7 @@ public class S3RepositoryPlugin extends Plugin implements RepositoryPlugin, Relo private final Path configPath; + private AsyncExecutorContainer urgentExecutorBuilder; private AsyncExecutorContainer priorityExecutorBuilder; private AsyncExecutorContainer normalExecutorBuilder; @@ -96,6 +100,10 @@ public S3RepositoryPlugin(final Settings settings, final Path configPath) { public List> getExecutorBuilders(Settings settings) { List> executorBuilders = new ArrayList<>(); int halfProcMaxAt5 = halfAllocatedProcessorsMaxFive(allocatedProcessors(settings)); + executorBuilders.add( + new FixedExecutorBuilder(settings, URGENT_FUTURE_COMPLETION, urgentPoolCount(settings), 10_000, URGENT_FUTURE_COMPLETION) + ); + executorBuilders.add(new ScalingExecutorBuilder(URGENT_STREAM_READER, 1, halfProcMaxAt5, TimeValue.timeValueMinutes(5))); executorBuilders.add( new FixedExecutorBuilder(settings, PRIORITY_FUTURE_COMPLETION, priorityPoolCount(settings), 10_000, PRIORITY_FUTURE_COMPLETION) ); @@ -128,6 +136,10 @@ private static int allocatedProcessors(Settings settings) { return OpenSearchExecutors.allocatedProcessors(settings); } + private static int urgentPoolCount(Settings settings) { + return boundedBy((allocatedProcessors(settings) + 7) / 8, 1, 2); + } + private static int priorityPoolCount(Settings settings) { return boundedBy((allocatedProcessors(settings) + 1) / 2, 2, 4); } @@ -150,8 +162,14 @@ public Collection createComponents( final IndexNameExpressionResolver expressionResolver, final Supplier repositoriesServiceSupplier ) { + int urgentEventLoopThreads = urgentPoolCount(clusterService.getSettings()); int priorityEventLoopThreads = priorityPoolCount(clusterService.getSettings()); int normalEventLoopThreads = normalPoolCount(clusterService.getSettings()); + this.urgentExecutorBuilder = new AsyncExecutorContainer( + threadPool.executor(URGENT_FUTURE_COMPLETION), + threadPool.executor(URGENT_STREAM_READER), + new AsyncTransferEventLoopGroup(urgentEventLoopThreads) + ); this.priorityExecutorBuilder = new AsyncExecutorContainer( threadPool.executor(PRIORITY_FUTURE_COMPLETION), threadPool.executor(PRIORITY_STREAM_READER), @@ -176,7 +194,8 @@ protected S3Repository createRepository( AsyncTransferManager asyncUploadUtils = new AsyncTransferManager( S3Repository.PARALLEL_MULTIPART_UPLOAD_MINIMUM_PART_SIZE_SETTING.get(clusterService.getSettings()).getBytes(), normalExecutorBuilder.getStreamReader(), - priorityExecutorBuilder.getStreamReader() + priorityExecutorBuilder.getStreamReader(), + urgentExecutorBuilder.getStreamReader() ); return new S3Repository( metadata, @@ -185,6 +204,7 @@ protected S3Repository createRepository( clusterService, recoverySettings, asyncUploadUtils, + urgentExecutorBuilder, priorityExecutorBuilder, normalExecutorBuilder, s3AsyncService, diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncPartsHandler.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncPartsHandler.java index 6007d9f9c8a1c..933ee6dc29513 
100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncPartsHandler.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncPartsHandler.java @@ -48,6 +48,7 @@ public class AsyncPartsHandler { * @param s3AsyncClient S3 client to use for upload * @param executorService Thread pool for regular upload * @param priorityExecutorService Thread pool for priority uploads + * @param urgentExecutorService Thread pool for urgent uploads * @param uploadRequest request for upload * @param streamContext Stream context used in supplying individual file parts * @param uploadId Upload Id against which multi-part is being performed @@ -60,6 +61,7 @@ public static List> uploadParts( S3AsyncClient s3AsyncClient, ExecutorService executorService, ExecutorService priorityExecutorService, + ExecutorService urgentExecutorService, UploadRequest uploadRequest, StreamContext streamContext, String uploadId, @@ -83,6 +85,7 @@ public static List> uploadParts( s3AsyncClient, executorService, priorityExecutorService, + urgentExecutorService, completedParts, inputStreamContainers, futures, @@ -129,6 +132,7 @@ private static void uploadPart( S3AsyncClient s3AsyncClient, ExecutorService executorService, ExecutorService priorityExecutorService, + ExecutorService urgentExecutorService, AtomicReferenceArray completedParts, AtomicReferenceArray inputStreamContainers, List> futures, @@ -138,9 +142,14 @@ private static void uploadPart( ) { Integer partNumber = uploadPartRequest.partNumber(); - ExecutorService streamReadExecutor = uploadRequest.getWritePriority() == WritePriority.HIGH - ? priorityExecutorService - : executorService; + ExecutorService streamReadExecutor; + if (uploadRequest.getWritePriority() == WritePriority.URGENT) { + streamReadExecutor = urgentExecutorService; + } else if (uploadRequest.getWritePriority() == WritePriority.HIGH) { + streamReadExecutor = priorityExecutorService; + } else { + streamReadExecutor = executorService; + } // Buffered stream is needed to allow mark and reset ops during IO errors so that only buffered // data can be retried instead of retrying whole file by the application. 
InputStream inputStream = new BufferedInputStream(inputStreamContainer.getInputStream(), (int) (ByteSizeUnit.MB.toBytes(1) + 1)); diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncTransferManager.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncTransferManager.java index a52745e33073e..4f1ab9764702e 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncTransferManager.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/async/AsyncTransferManager.java @@ -61,6 +61,7 @@ public final class AsyncTransferManager { private static final Logger log = LogManager.getLogger(AsyncTransferManager.class); private final ExecutorService executorService; private final ExecutorService priorityExecutorService; + private final ExecutorService urgentExecutorService; private final long minimumPartSize; /** @@ -75,10 +76,16 @@ public final class AsyncTransferManager { * @param executorService The stream reader {@link ExecutorService} for normal priority uploads * @param priorityExecutorService The stream read {@link ExecutorService} for high priority uploads */ - public AsyncTransferManager(long minimumPartSize, ExecutorService executorService, ExecutorService priorityExecutorService) { + public AsyncTransferManager( + long minimumPartSize, + ExecutorService executorService, + ExecutorService priorityExecutorService, + ExecutorService urgentExecutorService + ) { this.executorService = executorService; this.priorityExecutorService = priorityExecutorService; this.minimumPartSize = minimumPartSize; + this.urgentExecutorService = urgentExecutorService; } /** @@ -162,6 +169,7 @@ private void doUploadInParts( s3AsyncClient, executorService, priorityExecutorService, + urgentExecutorService, uploadRequest, streamContext, uploadId, @@ -308,9 +316,14 @@ private void uploadInOneChunk( putObjectRequestBuilder.checksumAlgorithm(ChecksumAlgorithm.CRC32); putObjectRequestBuilder.checksumCRC32(base64StringFromLong(uploadRequest.getExpectedChecksum())); } - ExecutorService streamReadExecutor = uploadRequest.getWritePriority() == WritePriority.HIGH - ? priorityExecutorService - : executorService; + ExecutorService streamReadExecutor; + if (uploadRequest.getWritePriority() == WritePriority.URGENT) { + streamReadExecutor = urgentExecutorService; + } else if (uploadRequest.getWritePriority() == WritePriority.HIGH) { + streamReadExecutor = priorityExecutorService; + } else { + streamReadExecutor = executorService; + } // Buffered stream is needed to allow mark and reset ops during IO errors so that only buffered // data can be retried instead of retrying whole file by the application. 
InputStream inputStream = new BufferedInputStream(inputStreamContainer.getInputStream(), (int) (ByteSizeUnit.MB.toBytes(1) + 1)); diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/RepositoryCredentialsTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/RepositoryCredentialsTests.java index a4bfe11383b4f..8e1926d40302f 100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/RepositoryCredentialsTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/RepositoryCredentialsTests.java @@ -302,7 +302,7 @@ protected S3Repository createRepository( ClusterService clusterService, RecoverySettings recoverySettings ) { - return new S3Repository(metadata, registry, service, clusterService, recoverySettings, null, null, null, null, false) { + return new S3Repository(metadata, registry, service, clusterService, recoverySettings, null, null, null, null, null, false) { @Override protected void assertSnapshotOrGenericThread() { // eliminate thread name check as we create repo manually on test/main threads diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3AsyncServiceTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3AsyncServiceTests.java index e9fe557ab751a..de9ad46bb222d 100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3AsyncServiceTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3AsyncServiceTests.java @@ -44,12 +44,12 @@ public void testCachedClientsAreReleased() { final S3ClientSettings otherClientSettings = s3AsyncService.settings(metadata2); assertSame(clientSettings, otherClientSettings); final AmazonAsyncS3Reference reference = SocketAccess.doPrivileged( - () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer) + () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer, asyncExecutorContainer) ); reference.close(); s3AsyncService.close(); final AmazonAsyncS3Reference referenceReloaded = SocketAccess.doPrivileged( - () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer) + () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer, asyncExecutorContainer) ); assertNotSame(referenceReloaded, reference); referenceReloaded.close(); @@ -79,12 +79,12 @@ public void testCachedClientsWithCredentialsAreReleased() { final S3ClientSettings otherClientSettings = s3AsyncService.settings(metadata2); assertSame(clientSettings, otherClientSettings); final AmazonAsyncS3Reference reference = SocketAccess.doPrivileged( - () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer) + () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer, asyncExecutorContainer) ); reference.close(); s3AsyncService.close(); final AmazonAsyncS3Reference referenceReloaded = SocketAccess.doPrivileged( - () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer) + () -> s3AsyncService.client(metadata1, asyncExecutorContainer, asyncExecutorContainer, asyncExecutorContainer) ); assertNotSame(referenceReloaded, reference); referenceReloaded.close(); diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerMockClientTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerMockClientTests.java index 6eb8faa746d34..7c67519f2f3b0 
100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerMockClientTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerMockClientTests.java @@ -266,10 +266,11 @@ public void verifySingleChunkUploadCallCount(boolean finalizeUploadFailure) { @Override public AmazonAsyncS3Reference client( RepositoryMetadata repositoryMetadata, + AsyncExecutorContainer urgentExecutorBuilder, AsyncExecutorContainer priorityExecutorBuilder, AsyncExecutorContainer normalExecutorBuilder ) { - return new AmazonAsyncS3Reference(AmazonAsyncS3WithCredentials.create(asyncClient, asyncClient, null)); + return new AmazonAsyncS3Reference(AmazonAsyncS3WithCredentials.create(asyncClient, asyncClient, asyncClient, null)); } } @@ -393,9 +394,11 @@ private S3BlobStore createBlobStore() { new AsyncTransferManager( S3Repository.PARALLEL_MULTIPART_UPLOAD_MINIMUM_PART_SIZE_SETTING.getDefault(Settings.EMPTY).getBytes(), asyncExecutorContainer.getStreamReader(), + asyncExecutorContainer.getStreamReader(), asyncExecutorContainer.getStreamReader() ), asyncExecutorContainer, + asyncExecutorContainer, asyncExecutorContainer ); } diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerRetriesTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerRetriesTests.java index a2214f5218991..ceab06bd051e9 100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerRetriesTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobContainerRetriesTests.java @@ -221,9 +221,11 @@ protected AsyncMultiStreamBlobContainer createBlobContainer( new AsyncTransferManager( S3Repository.PARALLEL_MULTIPART_UPLOAD_MINIMUM_PART_SIZE_SETTING.getDefault(Settings.EMPTY).getBytes(), asyncExecutorContainer.getStreamReader(), + asyncExecutorContainer.getStreamReader(), asyncExecutorContainer.getStreamReader() ), asyncExecutorContainer, + asyncExecutorContainer, asyncExecutorContainer ) ) { diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobStoreContainerTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobStoreContainerTests.java index 2701cae6a733b..58ad290a31e85 100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobStoreContainerTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3BlobStoreContainerTests.java @@ -935,7 +935,7 @@ public void testReadBlobAsyncMultiPart() throws Exception { final S3AsyncClient s3AsyncClient = mock(S3AsyncClient.class); final AmazonAsyncS3Reference amazonAsyncS3Reference = new AmazonAsyncS3Reference( - AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, null) + AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, s3AsyncClient, null) ); final S3BlobStore blobStore = mock(S3BlobStore.class); @@ -993,7 +993,7 @@ public void testReadBlobAsyncSinglePart() throws Exception { final S3AsyncClient s3AsyncClient = mock(S3AsyncClient.class); final AmazonAsyncS3Reference amazonAsyncS3Reference = new AmazonAsyncS3Reference( - AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, null) + AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, s3AsyncClient, null) ); final S3BlobStore blobStore = mock(S3BlobStore.class); final BlobPath blobPath = new BlobPath(); @@ -1048,7 +1048,7 @@ public void testReadBlobAsyncFailure() throws Exception { 
final S3AsyncClient s3AsyncClient = mock(S3AsyncClient.class); final AmazonAsyncS3Reference amazonAsyncS3Reference = new AmazonAsyncS3Reference( - AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, null) + AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, s3AsyncClient, null) ); final S3BlobStore blobStore = mock(S3BlobStore.class); @@ -1091,7 +1091,7 @@ public void testReadBlobAsyncOnCompleteFailureMissingData() throws Exception { final S3AsyncClient s3AsyncClient = mock(S3AsyncClient.class); final AmazonAsyncS3Reference amazonAsyncS3Reference = new AmazonAsyncS3Reference( - AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, null) + AmazonAsyncS3WithCredentials.create(s3AsyncClient, s3AsyncClient, s3AsyncClient, null) ); final S3BlobStore blobStore = mock(S3BlobStore.class); diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java index e65ca69a5047b..6fec535ae6301 100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java @@ -168,6 +168,7 @@ private S3Repository createS3Repo(RepositoryMetadata metadata) { null, null, null, + null, false ) { @Override diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/async/AsyncTransferManagerTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/async/AsyncTransferManagerTests.java index 97a746cdeed93..2437547a80a6f 100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/async/AsyncTransferManagerTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/async/AsyncTransferManagerTests.java @@ -64,6 +64,7 @@ public void setUp() throws Exception { asyncTransferManager = new AsyncTransferManager( ByteSizeUnit.MB.toBytes(5), Executors.newSingleThreadExecutor(), + Executors.newSingleThreadExecutor(), Executors.newSingleThreadExecutor() ); super.setUp(); diff --git a/server/src/main/java/org/opensearch/common/blobstore/stream/write/WritePriority.java b/server/src/main/java/org/opensearch/common/blobstore/stream/write/WritePriority.java index b8c0b52f93a3c..3f341c878c3c7 100644 --- a/server/src/main/java/org/opensearch/common/blobstore/stream/write/WritePriority.java +++ b/server/src/main/java/org/opensearch/common/blobstore/stream/write/WritePriority.java @@ -15,5 +15,6 @@ */ public enum WritePriority { NORMAL, - HIGH + HIGH, + URGENT } diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index ae4a3fab9852d..025ad075d83b6 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -378,7 +378,7 @@ private String writeGlobalMetadata(ClusterState clusterState) throws IOException result.set(globalMetadataContainer.path().buildAsString() + globalMetadataFilename); }, ex -> { throw new GlobalMetadataTransferException(ex.getMessage(), ex); }), latch); - GLOBAL_METADATA_FORMAT.writeAsync( + GLOBAL_METADATA_FORMAT.writeAsyncWithUrgentPriority( clusterState.metadata(), globalMetadataContainer, globalMetadataFilename, @@ -510,7 +510,7 @@ private void writeIndexMetadataAsync( ex -> 
latchedActionListener.onFailure(new IndexMetadataTransferException(indexMetadata.getIndex().toString(), ex)) ); - INDEX_METADATA_FORMAT.writeAsync( + INDEX_METADATA_FORMAT.writeAsyncWithUrgentPriority( indexMetadata, indexMetadataContainer, indexMetadataFilename, diff --git a/server/src/main/java/org/opensearch/repositories/blobstore/ChecksumBlobStoreFormat.java b/server/src/main/java/org/opensearch/repositories/blobstore/ChecksumBlobStoreFormat.java index e280141c12bc1..3e6052a5ef820 100644 --- a/server/src/main/java/org/opensearch/repositories/blobstore/ChecksumBlobStoreFormat.java +++ b/server/src/main/java/org/opensearch/repositories/blobstore/ChecksumBlobStoreFormat.java @@ -197,21 +197,56 @@ public void write( } /** - * Writes blob with resolving the blob name using {@link #blobName} method. - * Leverages the multipart upload if supported by the blobContainer. + * Internally calls {@link #writeAsyncWithPriority} with {@link WritePriority#NORMAL} + */ + public void writeAsync( + final T obj, + final BlobContainer blobContainer, + final String name, + final Compressor compressor, + ActionListener listener, + final ToXContent.Params params + ) throws IOException { + // use NORMAL priority by default + this.writeAsyncWithPriority(obj, blobContainer, name, compressor, WritePriority.NORMAL, listener, params); + } + + /** + * Internally calls {@link #writeAsyncWithPriority} with {@link WritePriority#URGENT} + *
+ * NOTE: We use this method to upload urgent priority objects like cluster state to remote stores. + * Use {@link #writeAsync(ToXContent, BlobContainer, String, Compressor, ActionListener, ToXContent.Params)} for + * other use cases. + */ + public void writeAsyncWithUrgentPriority( + final T obj, + final BlobContainer blobContainer, + final String name, + final Compressor compressor, + ActionListener listener, + final ToXContent.Params params + ) throws IOException { + this.writeAsyncWithPriority(obj, blobContainer, name, compressor, WritePriority.URGENT, listener, params); + } + + /** + * Method to writes blob with resolving the blob name using {@link #blobName} method with specified + * {@link WritePriority}. Leverages the multipart upload if supported by the blobContainer. * * @param obj object to be serialized * @param blobContainer blob container * @param name blob name * @param compressor whether to use compression + * @param priority write priority to be used * @param listener listener to listen to write result * @param params ToXContent params */ - public void writeAsync( + private void writeAsyncWithPriority( final T obj, final BlobContainer blobContainer, final String name, final Compressor compressor, + final WritePriority priority, ActionListener listener, final ToXContent.Params params ) throws IOException { @@ -222,7 +257,7 @@ public void writeAsync( } final String blobName = blobName(name); final BytesReference bytes = serialize(obj, blobName, compressor, params); - final String resourceDescription = "ChecksumBlobStoreFormat.writeAsync(blob=\"" + blobName + "\")"; + final String resourceDescription = "ChecksumBlobStoreFormat.writeAsyncWithPriority(blob=\"" + blobName + "\")"; try (IndexInput input = new ByteArrayIndexInput(resourceDescription, BytesReference.toBytes(bytes))) { long expectedChecksum; try { @@ -242,7 +277,7 @@ public void writeAsync( blobName, bytes.length(), true, - WritePriority.HIGH, + priority, (size, position) -> new OffsetRangeIndexInputStream(input, size, position), expectedChecksum, ((AsyncMultiStreamBlobContainer) blobContainer).remoteIntegrityCheckSupported() diff --git a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java index 4be5fc03c2a6d..173e15b8eca37 100644 --- a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java +++ b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java @@ -273,7 +273,7 @@ public void testWriteFullMetadataInParallelSuccess() throws IOException { new BytesArray(writtenBytes) ); - assertEquals(capturedWriteContext.getWritePriority(), WritePriority.HIGH); + assertEquals(capturedWriteContext.getWritePriority(), WritePriority.URGENT); assertEquals(writtenIndexMetadata.getNumberOfShards(), 1); assertEquals(writtenIndexMetadata.getNumberOfReplicas(), 0); assertEquals(writtenIndexMetadata.getIndex().getName(), "test-index"); diff --git a/server/src/test/java/org/opensearch/snapshots/BlobStoreFormatTests.java b/server/src/test/java/org/opensearch/snapshots/BlobStoreFormatTests.java index c114b56bd0b39..c5f36fcc01983 100644 --- a/server/src/test/java/org/opensearch/snapshots/BlobStoreFormatTests.java +++ b/server/src/test/java/org/opensearch/snapshots/BlobStoreFormatTests.java @@ -43,6 +43,7 @@ import org.opensearch.common.blobstore.fs.FsBlobStore; import org.opensearch.common.blobstore.stream.read.ReadContext; import 
org.opensearch.common.blobstore.stream.write.WriteContext; +import org.opensearch.common.blobstore.stream.write.WritePriority; import org.opensearch.common.compress.DeflateCompressor; import org.opensearch.common.io.Streams; import org.opensearch.common.io.stream.BytesStreamOutput; @@ -65,8 +66,13 @@ import java.util.Map; import java.util.concurrent.CountDownLatch; +import org.mockito.ArgumentCaptor; + import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.greaterThan; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; public class BlobStoreFormatTests extends OpenSearchTestCase { @@ -128,44 +134,36 @@ public void testBlobStoreAsyncOperations() throws IOException, InterruptedExcept BlobPath.cleanPath(), null ); + MockFsVerifyingBlobContainer spyContainer = spy(mockBlobContainer); ChecksumBlobStoreFormat checksumSMILE = new ChecksumBlobStoreFormat<>(BLOB_CODEC, "%s", BlobObj::fromXContent); - + ArgumentCaptor> actionListenerArgumentCaptor = ArgumentCaptor.forClass(ActionListener.class); + ArgumentCaptor writeContextArgumentCaptor = ArgumentCaptor.forClass(WriteContext.class); CountDownLatch latch = new CountDownLatch(2); - ActionListener actionListener = new ActionListener<>() { - @Override - public void onResponse(Void unused) { - logger.info("---> Async write succeeded"); - latch.countDown(); - } - - @Override - public void onFailure(Exception e) { - logger.info("---> Failure in async write"); - throw new RuntimeException("async write should not fail"); - } - }; - // Write blobs in different formats checksumSMILE.writeAsync( new BlobObj("checksum smile"), - mockBlobContainer, + spyContainer, "check-smile", CompressorRegistry.none(), - actionListener, + getVoidActionListener(latch), ChecksumBlobStoreFormat.SNAPSHOT_ONLY_FORMAT_PARAMS ); checksumSMILE.writeAsync( new BlobObj("checksum smile compressed"), - mockBlobContainer, + spyContainer, "check-smile-comp", CompressorRegistry.getCompressor(DeflateCompressor.NAME), - actionListener, + getVoidActionListener(latch), ChecksumBlobStoreFormat.SNAPSHOT_ONLY_FORMAT_PARAMS ); latch.await(); + verify(spyContainer, times(2)).asyncBlobUpload(writeContextArgumentCaptor.capture(), actionListenerArgumentCaptor.capture()); + assertEquals(2, writeContextArgumentCaptor.getAllValues().size()); + writeContextArgumentCaptor.getAllValues() + .forEach(writeContext -> assertEquals(WritePriority.NORMAL, writeContext.getWritePriority())); // Assert that all checksum blobs can be read assertEquals(checksumSMILE.read(mockBlobContainer.getDelegate(), "check-smile", xContentRegistry()).getText(), "checksum smile"); assertEquals( @@ -174,6 +172,39 @@ public void onFailure(Exception e) { ); } + public void testBlobStorePriorityAsyncOperation() throws IOException, InterruptedException { + BlobStore blobStore = createTestBlobStore(); + MockFsVerifyingBlobContainer mockBlobContainer = new MockFsVerifyingBlobContainer( + (FsBlobStore) blobStore, + BlobPath.cleanPath(), + null + ); + MockFsVerifyingBlobContainer spyContainer = spy(mockBlobContainer); + ChecksumBlobStoreFormat checksumSMILE = new ChecksumBlobStoreFormat<>(BLOB_CODEC, "%s", BlobObj::fromXContent); + + ArgumentCaptor> actionListenerArgumentCaptor = ArgumentCaptor.forClass(ActionListener.class); + ArgumentCaptor writeContextArgumentCaptor = ArgumentCaptor.forClass(WriteContext.class); + CountDownLatch latch = new CountDownLatch(1); + + // Write blobs in different formats + 
checksumSMILE.writeAsyncWithUrgentPriority( + new BlobObj("cluster state diff"), + spyContainer, + "cluster-state-diff", + CompressorRegistry.none(), + getVoidActionListener(latch), + ChecksumBlobStoreFormat.SNAPSHOT_ONLY_FORMAT_PARAMS + ); + latch.await(); + + verify(spyContainer).asyncBlobUpload(writeContextArgumentCaptor.capture(), actionListenerArgumentCaptor.capture()); + assertEquals(WritePriority.URGENT, writeContextArgumentCaptor.getValue().getWritePriority()); + assertEquals( + checksumSMILE.read(mockBlobContainer.getDelegate(), "cluster-state-diff", xContentRegistry()).getText(), + "cluster state diff" + ); + } + public void testBlobStoreOperations() throws IOException { BlobStore blobStore = createTestBlobStore(); BlobContainer blobContainer = blobStore.blobContainer(BlobPath.cleanPath()); @@ -228,6 +259,24 @@ public void testBlobCorruption() throws IOException { } } + private ActionListener getVoidActionListener(CountDownLatch latch) { + ActionListener actionListener = new ActionListener<>() { + @Override + public void onResponse(Void unused) { + logger.info("---> Async write succeeded"); + latch.countDown(); + } + + @Override + public void onFailure(Exception e) { + logger.info("---> Failure in async write"); + throw new RuntimeException("async write should not fail"); + } + }; + + return actionListener; + } + protected BlobStore createTestBlobStore() throws IOException { return new FsBlobStore(randomIntBetween(1, 8) * 1024, createTempDir(), false); } From 54e74a84437238c6154b0a15d209a9a1ecbaa4bb Mon Sep 17 00:00:00 2001 From: Aman Khare <85096200+amkhar@users.noreply.github.com> Date: Tue, 24 Oct 2023 14:28:29 +0530 Subject: [PATCH 05/33] Add cluster state stats (#10670) * Add cluster state update stats along with remote upload stats around success/ failure, latency metric Signed-off-by: Aman Khare --- CHANGELOG.md | 1 + .../discovery/ClusterManagerDisruptionIT.java | 3 + .../remote/RemoteClusterStateServiceIT.java | 43 ++++++ .../coordination/CoordinationState.java | 6 + .../cluster/coordination/Coordinator.java | 12 +- .../coordination/InMemoryPersistedState.java | 5 + .../coordination/PersistedStateStats.java | 126 ++++++++++++++++++ .../cluster/service/ClusterStateStats.java | 120 +++++++++++++++++ .../cluster/service/MasterService.java | 23 +++- .../opensearch/discovery/DiscoveryStats.java | 21 ++- .../opensearch/gateway/GatewayMetaState.java | 13 ++ .../remote/RemoteClusterStateService.java | 20 ++- .../remote/RemotePersistenceStats.java | 37 +++++ .../cluster/node/stats/NodeStatsTests.java | 29 +++- .../cluster/service/MasterServiceTests.java | 3 + .../GatewayMetaStatePersistedStateTests.java | 22 +++ .../RemoteClusterStateServiceTests.java | 34 +++++ .../AbstractCoordinatorTestCase.java | 5 + 18 files changed, 511 insertions(+), 12 deletions(-) create mode 100644 server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java create mode 100644 server/src/main/java/org/opensearch/cluster/service/ClusterStateStats.java create mode 100644 server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b0d9720ad208..8c7e3ee151d64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -96,6 +96,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Introduce ConcurrentQueryProfiler to profile query using concurrent segment search path and support concurrency during rewrite and create weight ([10352](https://github.com/opensearch-project/OpenSearch/pull/10352)) - 
[Remote cluster state] Make index and global metadata upload timeout dynamic cluster settings ([#10814](https://github.com/opensearch-project/OpenSearch/pull/10814)) - Added cluster setting cluster.restrict.index.replication_type to restrict setting of index setting replication type ([#10866](https://github.com/opensearch-project/OpenSearch/pull/10866)) +- Add cluster state stats ([#10670](https://github.com/opensearch-project/OpenSearch/pull/10670)) ### Dependencies - Bump `com.google.api.grpc:proto-google-common-protos` from 2.10.0 to 2.25.1 ([#10208](https://github.com/opensearch-project/OpenSearch/pull/10208), [#10298](https://github.com/opensearch-project/OpenSearch/pull/10298)) diff --git a/server/src/internalClusterTest/java/org/opensearch/discovery/ClusterManagerDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/discovery/ClusterManagerDisruptionIT.java index 1463c45aa9b2f..79f6ba6dfa642 100644 --- a/server/src/internalClusterTest/java/org/opensearch/discovery/ClusterManagerDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/discovery/ClusterManagerDisruptionIT.java @@ -39,6 +39,7 @@ import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.coordination.NoClusterManagerBlockService; import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.service.ClusterStateStats; import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; import org.opensearch.core.xcontent.MediaTypeRegistry; @@ -199,6 +200,8 @@ public void testIsolateClusterManagerAndVerifyClusterStateConsensus() throws Exc } } + ClusterStateStats clusterStateStats = internalCluster().clusterService().getClusterManagerService().getClusterStateStats(); + assertTrue(clusterStateStats.getUpdateFailed() > 0); }); } diff --git a/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java b/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java index 7304304e522f8..59eef3c06844b 100644 --- a/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java @@ -8,9 +8,12 @@ package org.opensearch.gateway.remote; +import org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest; +import org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.blobstore.BlobPath; import org.opensearch.common.settings.Settings; +import org.opensearch.discovery.DiscoveryStats; import org.opensearch.remotestore.RemoteStoreBaseIntegTestCase; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.blobstore.BlobStoreRepository; @@ -19,6 +22,7 @@ import java.nio.charset.StandardCharsets; import java.util.Base64; import java.util.Map; +import java.util.stream.Collectors; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; import static org.opensearch.gateway.remote.RemoteClusterStateService.REMOTE_CLUSTER_STATE_ENABLED_SETTING; @@ -94,6 +98,45 @@ public void testFullClusterRestoreStaleDelete() throws Exception { assertEquals(shardCount, indexMetadataMap.values().stream().findFirst().get().getNumberOfShards()); } + public void testRemoteStateStats() { + int shardCount = randomIntBetween(1, 2); + int replicaCount = 1; + int dataNodeCount = shardCount * 
(replicaCount + 1); + int clusterManagerNodeCount = 1; + prepareCluster(clusterManagerNodeCount, dataNodeCount, INDEX_NAME, replicaCount, shardCount); + String clusterManagerNode = internalCluster().getClusterManagerName(); + String dataNode = internalCluster().getDataNodeNames().stream().collect(Collectors.toList()).get(0); + + // Fetch _nodes/stats + NodesStatsResponse nodesStatsResponse = client().admin() + .cluster() + .prepareNodesStats(clusterManagerNode) + .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) + .get(); + + // assert cluster state stats + DiscoveryStats discoveryStats = nodesStatsResponse.getNodes().get(0).getDiscoveryStats(); + + assertNotNull(discoveryStats.getClusterStateStats()); + assertTrue(discoveryStats.getClusterStateStats().getUpdateSuccess() > 1); + assertEquals(0, discoveryStats.getClusterStateStats().getUpdateFailed()); + assertTrue(discoveryStats.getClusterStateStats().getUpdateTotalTimeInMillis() > 0); + // assert remote state stats + assertTrue(discoveryStats.getClusterStateStats().getPersistenceStats().get(0).getSuccessCount() > 1); + assertEquals(0, discoveryStats.getClusterStateStats().getPersistenceStats().get(0).getFailedCount()); + assertTrue(discoveryStats.getClusterStateStats().getPersistenceStats().get(0).getTotalTimeInMillis() > 0); + + NodesStatsResponse nodesStatsResponseDataNode = client().admin() + .cluster() + .prepareNodesStats(dataNode) + .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) + .get(); + // assert cluster state stats for data node + DiscoveryStats dataNodeDiscoveryStats = nodesStatsResponseDataNode.getNodes().get(0).getDiscoveryStats(); + assertNotNull(dataNodeDiscoveryStats.getClusterStateStats()); + assertEquals(0, dataNodeDiscoveryStats.getClusterStateStats().getUpdateSuccess()); + } + private void setReplicaCount(int replicaCount) { client().admin() .indices() diff --git a/server/src/main/java/org/opensearch/cluster/coordination/CoordinationState.java b/server/src/main/java/org/opensearch/cluster/coordination/CoordinationState.java index a339852e6ed8d..987a3e3ffa7d3 100644 --- a/server/src/main/java/org/opensearch/cluster/coordination/CoordinationState.java +++ b/server/src/main/java/org/opensearch/cluster/coordination/CoordinationState.java @@ -638,6 +638,12 @@ public interface PersistedState extends Closeable { */ void setLastAcceptedState(ClusterState clusterState); + /** + * Returns the stats for the persistence layer for {@link CoordinationState}. + * @return PersistedStateStats + */ + PersistedStateStats getStats(); + /** * Marks the last accepted cluster state as committed. 
* After a successful call to this method, {@link #getLastAcceptedState()} should return the last cluster state that was set, diff --git a/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java index eb30460ca1b7f..a4ffab7fb70c9 100644 --- a/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java +++ b/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java @@ -56,6 +56,7 @@ import org.opensearch.cluster.service.ClusterApplier; import org.opensearch.cluster.service.ClusterApplier.ClusterApplyListener; import org.opensearch.cluster.service.ClusterManagerService; +import org.opensearch.cluster.service.ClusterStateStats; import org.opensearch.common.Booleans; import org.opensearch.common.Nullable; import org.opensearch.common.Priority; @@ -865,7 +866,16 @@ protected void doStart() { @Override public DiscoveryStats stats() { - return new DiscoveryStats(new PendingClusterStateStats(0, 0, 0), publicationHandler.stats()); + ClusterStateStats clusterStateStats = clusterManagerService.getClusterStateStats(); + ArrayList stats = new ArrayList<>(); + Stream.of(PersistedStateRegistry.PersistedStateType.values()).forEach(stateType -> { + if (persistedStateRegistry.getPersistedState(stateType) != null + && persistedStateRegistry.getPersistedState(stateType).getStats() != null) { + stats.add(persistedStateRegistry.getPersistedState(stateType).getStats()); + } + }); + clusterStateStats.setPersistenceStats(stats); + return new DiscoveryStats(new PendingClusterStateStats(0, 0, 0), publicationHandler.stats(), clusterStateStats); } @Override diff --git a/server/src/main/java/org/opensearch/cluster/coordination/InMemoryPersistedState.java b/server/src/main/java/org/opensearch/cluster/coordination/InMemoryPersistedState.java index 67ef82ee7b2e9..b77ede5471534 100644 --- a/server/src/main/java/org/opensearch/cluster/coordination/InMemoryPersistedState.java +++ b/server/src/main/java/org/opensearch/cluster/coordination/InMemoryPersistedState.java @@ -65,6 +65,11 @@ public void setLastAcceptedState(ClusterState clusterState) { this.acceptedState = clusterState; } + @Override + public PersistedStateStats getStats() { + return null; + } + @Override public long getCurrentTerm() { return currentTerm; diff --git a/server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java b/server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java new file mode 100644 index 0000000000000..1dc20e564ade2 --- /dev/null +++ b/server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java @@ -0,0 +1,126 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.cluster.coordination; + +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.common.io.stream.Writeable; +import org.opensearch.core.xcontent.ToXContentObject; +import org.opensearch.core.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Persisted cluster state related stats. 
+ * + * @opensearch.internal + */ +public class PersistedStateStats implements Writeable, ToXContentObject { + private String statsName; + private AtomicLong totalTimeInMillis = new AtomicLong(0); + private AtomicLong failedCount = new AtomicLong(0); + private AtomicLong successCount = new AtomicLong(0); + private Map extendedFields = new HashMap<>(); // keeping minimal extensibility + + public PersistedStateStats(String statsName) { + this.statsName = statsName; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLong(successCount.get()); + out.writeVLong(failedCount.get()); + out.writeVLong(totalTimeInMillis.get()); + if (extendedFields.size() > 0) { + out.writeBoolean(true); + out.writeVInt(extendedFields.size()); + for (Map.Entry extendedField : extendedFields.entrySet()) { + out.writeString(extendedField.getKey()); + out.writeVLong(extendedField.getValue().get()); + } + } else { + out.writeBoolean(false); + } + } + + public PersistedStateStats(StreamInput in) throws IOException { + this.successCount = new AtomicLong(in.readVLong()); + this.failedCount = new AtomicLong(in.readVLong()); + this.totalTimeInMillis = new AtomicLong(in.readVLong()); + if (in.readBoolean()) { + int extendedFieldsSize = in.readVInt(); + this.extendedFields = new HashMap<>(); + for (int fieldNumber = 0; fieldNumber < extendedFieldsSize; fieldNumber++) { + extendedFields.put(in.readString(), new AtomicLong(in.readVLong())); + } + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(statsName); + builder.field(Fields.SUCCESS_COUNT, getSuccessCount()); + builder.field(Fields.FAILED_COUNT, getFailedCount()); + builder.field(Fields.TOTAL_TIME_IN_MILLIS, getTotalTimeInMillis()); + if (extendedFields.size() > 0) { + for (Map.Entry extendedField : extendedFields.entrySet()) { + builder.field(extendedField.getKey(), extendedField.getValue().get()); + } + } + builder.endObject(); + return builder; + } + + public void stateFailed() { + failedCount.incrementAndGet(); + } + + public void stateSucceeded() { + successCount.incrementAndGet(); + } + + /** + * Expects user to send time taken in milliseconds. 
+ * + * @param timeTakenInUpload time taken in uploading the cluster state to remote + */ + public void stateTook(long timeTakenInUpload) { + totalTimeInMillis.addAndGet(timeTakenInUpload); + } + + public long getTotalTimeInMillis() { + return totalTimeInMillis.get(); + } + + public long getFailedCount() { + return failedCount.get(); + } + + public long getSuccessCount() { + return successCount.get(); + } + + protected void addToExtendedFields(String extendedField, AtomicLong extendedFieldValue) { + this.extendedFields.put(extendedField, extendedFieldValue); + } + + /** + * Fields for parsing and toXContent + * + * @opensearch.internal + */ + static final class Fields { + static final String SUCCESS_COUNT = "success_count"; + static final String TOTAL_TIME_IN_MILLIS = "total_time_in_millis"; + static final String FAILED_COUNT = "failed_count"; + } +} diff --git a/server/src/main/java/org/opensearch/cluster/service/ClusterStateStats.java b/server/src/main/java/org/opensearch/cluster/service/ClusterStateStats.java new file mode 100644 index 0000000000000..96683ce720d0b --- /dev/null +++ b/server/src/main/java/org/opensearch/cluster/service/ClusterStateStats.java @@ -0,0 +1,120 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.cluster.service; + +import org.opensearch.cluster.coordination.PersistedStateStats; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.common.io.stream.Writeable; +import org.opensearch.core.xcontent.ToXContentObject; +import org.opensearch.core.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Cluster state related stats. 
+ * + * @opensearch.internal + */ +public class ClusterStateStats implements Writeable, ToXContentObject { + + private AtomicLong updateSuccess = new AtomicLong(0); + private AtomicLong updateTotalTimeInMillis = new AtomicLong(0); + private AtomicLong updateFailed = new AtomicLong(0); + private List persistenceStats = new ArrayList<>(); + + public ClusterStateStats() {} + + public long getUpdateSuccess() { + return updateSuccess.get(); + } + + public long getUpdateTotalTimeInMillis() { + return updateTotalTimeInMillis.get(); + } + + public long getUpdateFailed() { + return updateFailed.get(); + } + + public List getPersistenceStats() { + return persistenceStats; + } + + public void stateUpdated() { + updateSuccess.incrementAndGet(); + } + + public void stateUpdateFailed() { + updateFailed.incrementAndGet(); + } + + public void stateUpdateTook(long stateUpdateTime) { + updateTotalTimeInMillis.addAndGet(stateUpdateTime); + } + + public ClusterStateStats setPersistenceStats(List persistenceStats) { + this.persistenceStats = persistenceStats; + return this; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLong(updateSuccess.get()); + out.writeVLong(updateTotalTimeInMillis.get()); + out.writeVLong(updateFailed.get()); + out.writeVInt(persistenceStats.size()); + for (PersistedStateStats stats : persistenceStats) { + stats.writeTo(out); + } + } + + public ClusterStateStats(StreamInput in) throws IOException { + this.updateSuccess = new AtomicLong(in.readVLong()); + this.updateTotalTimeInMillis = new AtomicLong(in.readVLong()); + this.updateFailed = new AtomicLong(in.readVLong()); + int persistedStatsSize = in.readVInt(); + this.persistenceStats = new ArrayList<>(); + for (int statsNumber = 0; statsNumber < persistedStatsSize; statsNumber++) { + PersistedStateStats stats = new PersistedStateStats(in); + this.persistenceStats.add(stats); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(Fields.CLUSTER_STATE_STATS); + builder.startObject(Fields.OVERALL); + builder.field(Fields.UPDATE_COUNT, getUpdateSuccess()); + builder.field(Fields.TOTAL_TIME_IN_MILLIS, getUpdateTotalTimeInMillis()); + builder.field(Fields.FAILED_COUNT, getUpdateFailed()); + builder.endObject(); + for (PersistedStateStats stats : persistenceStats) { + stats.toXContent(builder, params); + } + builder.endObject(); + return builder; + } + + /** + * Fields for parsing and toXContent + * + * @opensearch.internal + */ + static final class Fields { + static final String CLUSTER_STATE_STATS = "cluster_state_stats"; + static final String OVERALL = "overall"; + static final String UPDATE_COUNT = "update_count"; + static final String TOTAL_TIME_IN_MILLIS = "total_time_in_millis"; + static final String FAILED_COUNT = "failed_count"; + } +} diff --git a/server/src/main/java/org/opensearch/cluster/service/MasterService.java b/server/src/main/java/org/opensearch/cluster/service/MasterService.java index 563b69dfd0e2a..07c3f93ae6486 100644 --- a/server/src/main/java/org/opensearch/cluster/service/MasterService.java +++ b/server/src/main/java/org/opensearch/cluster/service/MasterService.java @@ -112,7 +112,9 @@ public class MasterService extends AbstractLifecycleComponent { static final String CLUSTER_MANAGER_UPDATE_THREAD_NAME = "clusterManagerService#updateTask"; - /** @deprecated As of 2.2, because supporting inclusive language, replaced by {@link #CLUSTER_MANAGER_UPDATE_THREAD_NAME} */ + /** + * @deprecated As of 
2.2, because supporting inclusive language, replaced by {@link #CLUSTER_MANAGER_UPDATE_THREAD_NAME} + */ @Deprecated static final String MASTER_UPDATE_THREAD_NAME = "masterService#updateTask"; @@ -130,6 +132,7 @@ public class MasterService extends AbstractLifecycleComponent { private volatile Batcher taskBatcher; protected final ClusterManagerTaskThrottler clusterManagerTaskThrottler; private final ClusterManagerThrottlingStats throttlingStats; + private final ClusterStateStats stateStats; public MasterService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) { this.nodeName = Objects.requireNonNull(Node.NODE_NAME_SETTING.get(settings)); @@ -147,6 +150,7 @@ public MasterService(Settings settings, ClusterSettings clusterSettings, ThreadP this::getMinNodeVersion, throttlingStats ); + this.stateStats = new ClusterStateStats(); this.threadPool = threadPool; } @@ -339,7 +343,7 @@ private TimeValue getTimeSince(long startTimeNanos) { return TimeValue.timeValueMillis(TimeValue.nsecToMSec(threadPool.preciseRelativeTimeInNanos() - startTimeNanos)); } - protected void publish(ClusterChangedEvent clusterChangedEvent, TaskOutputs taskOutputs, long startTimeMillis) { + protected void publish(ClusterChangedEvent clusterChangedEvent, TaskOutputs taskOutputs, long startTimeNanos) { final PlainActionFuture fut = new PlainActionFuture() { @Override protected boolean blockingAllowed() { @@ -352,8 +356,12 @@ protected boolean blockingAllowed() { try { FutureUtils.get(fut); onPublicationSuccess(clusterChangedEvent, taskOutputs); + final long durationMillis = getTimeSince(startTimeNanos).millis(); + stateStats.stateUpdateTook(durationMillis); + stateStats.stateUpdated(); } catch (Exception e) { - onPublicationFailed(clusterChangedEvent, taskOutputs, startTimeMillis, e); + stateStats.stateUpdateFailed(); + onPublicationFailed(clusterChangedEvent, taskOutputs, startTimeNanos, e); } } @@ -464,7 +472,6 @@ public Builder incrementVersion(ClusterState clusterState) { * @param source the source of the cluster state update task * @param updateTask the full context for the cluster state update * task - * */ public & ClusterStateTaskListener> void submitStateUpdateTask( String source, @@ -490,7 +497,6 @@ public & Cluster * @param listener callback after the cluster state update task * completes * @param the type of the cluster state update task state - * */ public void submitStateUpdateTask( String source, @@ -947,7 +953,7 @@ void onNoLongerClusterManager() { /** * Functionality for register task key to cluster manager node. 
* - * @param taskKey - task key of task + * @param taskKey - task key of task * @param throttlingEnabled - throttling is enabled for task or not i.e does data node perform retries on it or not * @return throttling task key which needs to be passed while submitting task to cluster manager */ @@ -966,7 +972,6 @@ public ClusterManagerTaskThrottler.ThrottlingKey registerClusterManagerTask(Stri * that share the same executor will be executed * batches on this executor * @param the type of the cluster state update task state - * */ public void submitStateUpdateTasks( final String source, @@ -996,4 +1001,8 @@ public void submitStateUpdateTasks( } } + public ClusterStateStats getClusterStateStats() { + return stateStats; + } + } diff --git a/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java b/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java index 665ecf77d7aa7..ea93ccd09ed39 100644 --- a/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java +++ b/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java @@ -32,8 +32,10 @@ package org.opensearch.discovery; +import org.opensearch.Version; import org.opensearch.cluster.coordination.PendingClusterStateStats; import org.opensearch.cluster.coordination.PublishClusterStateStats; +import org.opensearch.cluster.service.ClusterStateStats; import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.core.common.io.stream.Writeable; @@ -51,21 +53,31 @@ public class DiscoveryStats implements Writeable, ToXContentFragment { private final PendingClusterStateStats queueStats; private final PublishClusterStateStats publishStats; + private final ClusterStateStats clusterStateStats; - public DiscoveryStats(PendingClusterStateStats queueStats, PublishClusterStateStats publishStats) { + public DiscoveryStats(PendingClusterStateStats queueStats, PublishClusterStateStats publishStats, ClusterStateStats clusterStateStats) { this.queueStats = queueStats; this.publishStats = publishStats; + this.clusterStateStats = clusterStateStats; } public DiscoveryStats(StreamInput in) throws IOException { queueStats = in.readOptionalWriteable(PendingClusterStateStats::new); publishStats = in.readOptionalWriteable(PublishClusterStateStats::new); + if (in.getVersion().onOrAfter(Version.V_3_0_0)) { + clusterStateStats = in.readOptionalWriteable(ClusterStateStats::new); + } else { + clusterStateStats = null; + } } @Override public void writeTo(StreamOutput out) throws IOException { out.writeOptionalWriteable(queueStats); out.writeOptionalWriteable(publishStats); + if (out.getVersion().onOrAfter(Version.V_3_0_0)) { + out.writeOptionalWriteable(clusterStateStats); + } } @Override @@ -77,6 +89,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (publishStats != null) { publishStats.toXContent(builder, params); } + if (clusterStateStats != null) { + clusterStateStats.toXContent(builder, params); + } builder.endObject(); return builder; } @@ -92,4 +107,8 @@ public PendingClusterStateStats getQueueStats() { public PublishClusterStateStats getPublishStats() { return publishStats; } + + public ClusterStateStats getClusterStateStats() { + return clusterStateStats; + } } diff --git a/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java b/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java index f855449c708d2..350a361a49a62 100644 --- a/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java +++ 
b/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java @@ -47,6 +47,7 @@ import org.opensearch.cluster.coordination.InMemoryPersistedState; import org.opensearch.cluster.coordination.PersistedStateRegistry; import org.opensearch.cluster.coordination.PersistedStateRegistry.PersistedStateType; +import org.opensearch.cluster.coordination.PersistedStateStats; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.metadata.IndexTemplateMetadata; import org.opensearch.cluster.metadata.Manifest; @@ -615,6 +616,12 @@ public void setLastAcceptedState(ClusterState clusterState) { lastAcceptedState = clusterState; } + @Override + public PersistedStateStats getStats() { + // Note: These stats are not published yet, will come in future + return null; + } + private PersistedClusterStateService.Writer getWriterSafe() { final PersistedClusterStateService.Writer writer = persistenceWriter.get(); if (writer == null) { @@ -717,10 +724,16 @@ assert verifyManifestAndClusterState(lastAcceptedManifest, lastAcceptedState) == lastAcceptedManifest = manifest; lastAcceptedState = clusterState; } catch (Exception e) { + remoteClusterStateService.writeMetadataFailed(); handleExceptionOnWrite(e); } } + @Override + public PersistedStateStats getStats() { + return remoteClusterStateService.getStats(); + } + private boolean verifyManifestAndClusterState(ClusterMetadataManifest manifest, ClusterState clusterState) { assert manifest != null : "ClusterMetadataManifest is null"; assert clusterState != null : "ClusterState is null"; diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index 025ad075d83b6..329ebd0dcd2b8 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -159,7 +159,7 @@ public class RemoteClusterStateService implements Closeable { private volatile TimeValue globalMetadataUploadTimeout; private final AtomicBoolean deleteStaleMetadataRunning = new AtomicBoolean(false); - + private final RemotePersistenceStats remoteStateStats; public static final int INDEX_METADATA_CURRENT_CODEC_VERSION = 1; public static final int MANIFEST_CURRENT_CODEC_VERSION = ClusterMetadataManifest.CODEC_V1; public static final int GLOBAL_METADATA_CURRENT_CODEC_VERSION = 1; @@ -193,6 +193,7 @@ public RemoteClusterStateService( clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold); clusterSettings.addSettingsUpdateConsumer(INDEX_METADATA_UPLOAD_TIMEOUT_SETTING, this::setIndexMetadataUploadTimeout); clusterSettings.addSettingsUpdateConsumer(GLOBAL_METADATA_UPLOAD_TIMEOUT_SETTING, this::setGlobalMetadataUploadTimeout); + this.remoteStateStats = new RemotePersistenceStats(); } private BlobStoreTransferService getBlobStoreTransferService() { @@ -233,6 +234,8 @@ public ClusterMetadataManifest writeFullMetadata(ClusterState clusterState, Stri false ); final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos); + remoteStateStats.stateSucceeded(); + remoteStateStats.stateTook(durationMillis); if (durationMillis >= slowWriteLoggingThreshold.getMillis()) { logger.warn( "writing cluster state took [{}ms] which is above the warn threshold of [{}]; " + "wrote full state with [{}] indices", @@ -334,6 +337,8 @@ public ClusterMetadataManifest 
writeIncrementalMetadata( deleteStaleClusterMetadata(clusterState.getClusterName().value(), clusterState.metadata().clusterUUID(), RETAINED_MANIFESTS); final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos); + remoteStateStats.stateSucceeded(); + remoteStateStats.stateTook(durationMillis); if (durationMillis >= slowWriteLoggingThreshold.getMillis()) { logger.warn( "writing cluster state took [{}ms] which is above the warn threshold of [{}]; " @@ -1059,6 +1064,10 @@ public static String encodeString(String content) { return Base64.getUrlEncoder().withoutPadding().encodeToString(content.getBytes(StandardCharsets.UTF_8)); } + public void writeMetadataFailed() { + getStats().stateFailed(); + } + /** * Exception for IndexMetadata transfer failures to remote */ @@ -1093,7 +1102,7 @@ public GlobalMetadataTransferException(String errorDesc, Throwable cause) { * @param clusterName name of the cluster * @param clusterUUIDs clusteUUIDs for which the remote state needs to be purged */ - private void deleteStaleUUIDsClusterMetadata(String clusterName, List clusterUUIDs) { + void deleteStaleUUIDsClusterMetadata(String clusterName, List clusterUUIDs) { clusterUUIDs.forEach(clusterUUID -> { getBlobStoreTransferService().deleteAsync( ThreadPool.Names.REMOTE_PURGE, @@ -1113,6 +1122,7 @@ public void onFailure(Exception e) { ), e ); + remoteStateStats.cleanUpAttemptFailed(); } } ); @@ -1228,8 +1238,10 @@ private void deleteClusterMetadata( logger.error("Error while fetching Remote Cluster Metadata manifests", e); } catch (IOException e) { logger.error("Error while deleting stale Remote Cluster Metadata files", e); + remoteStateStats.cleanUpAttemptFailed(); } catch (Exception e) { logger.error("Unexpected error while deleting stale Remote Cluster Metadata files", e); + remoteStateStats.cleanUpAttemptFailed(); } } @@ -1260,4 +1272,8 @@ public void deleteStaleClusterUUIDs(ClusterState clusterState, ClusterMetadataMa deleteStaleUUIDsClusterMetadata(clusterName, new ArrayList<>(allClustersUUIDsInRemote)); }); } + + public RemotePersistenceStats getStats() { + return remoteStateStats; + } } diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java b/server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java new file mode 100644 index 0000000000000..f2330846fa23e --- /dev/null +++ b/server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.gateway.remote; + +import org.opensearch.cluster.coordination.PersistedStateStats; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * Remote state related extended stats. 
+ * + * @opensearch.internal + */ +public class RemotePersistenceStats extends PersistedStateStats { + static final String CLEANUP_ATTEMPT_FAILED_COUNT = "cleanup_attempt_failed_count"; + static final String REMOTE_UPLOAD = "remote_upload"; + private AtomicLong cleanupAttemptFailedCount = new AtomicLong(0); + + public RemotePersistenceStats() { + super(REMOTE_UPLOAD); + addToExtendedFields(CLEANUP_ATTEMPT_FAILED_COUNT, cleanupAttemptFailedCount); + } + + public void cleanUpAttemptFailed() { + cleanupAttemptFailedCount.incrementAndGet(); + } + + public long getCleanupAttemptFailedCount() { + return cleanupAttemptFailedCount.get(); + } +} diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java index ebdd012006fb2..3050d1674a95b 100644 --- a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java +++ b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java @@ -36,10 +36,12 @@ import org.opensearch.action.admin.indices.stats.CommonStatsFlags; import org.opensearch.action.search.SearchRequestStats; import org.opensearch.cluster.coordination.PendingClusterStateStats; +import org.opensearch.cluster.coordination.PersistedStateStats; import org.opensearch.cluster.coordination.PublishClusterStateStats; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.routing.WeightedRoutingStats; import org.opensearch.cluster.service.ClusterManagerThrottlingStats; +import org.opensearch.cluster.service.ClusterStateStats; import org.opensearch.common.io.stream.BytesStreamOutput; import org.opensearch.common.metrics.OperationStats; import org.opensearch.core.common.io.stream.StreamInput; @@ -47,6 +49,7 @@ import org.opensearch.core.indices.breaker.AllCircuitBreakerStats; import org.opensearch.core.indices.breaker.CircuitBreakerStats; import org.opensearch.discovery.DiscoveryStats; +import org.opensearch.gateway.remote.RemotePersistenceStats; import org.opensearch.http.HttpStats; import org.opensearch.index.ReplicationStats; import org.opensearch.index.SegmentReplicationRejectionStats; @@ -72,6 +75,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; @@ -349,6 +353,25 @@ public void testSerialization() throws IOException { assertEquals(queueStats.getTotal(), deserializedDiscoveryStats.getQueueStats().getTotal()); assertEquals(queueStats.getPending(), deserializedDiscoveryStats.getQueueStats().getPending()); } + ClusterStateStats stateStats = discoveryStats.getClusterStateStats(); + if (stateStats == null) { + assertNull(deserializedDiscoveryStats.getClusterStateStats()); + } else { + assertEquals(stateStats.getUpdateFailed(), deserializedDiscoveryStats.getClusterStateStats().getUpdateFailed()); + assertEquals(stateStats.getUpdateSuccess(), deserializedDiscoveryStats.getClusterStateStats().getUpdateSuccess()); + assertEquals( + stateStats.getUpdateTotalTimeInMillis(), + deserializedDiscoveryStats.getClusterStateStats().getUpdateTotalTimeInMillis() + ); + assertEquals(1, deserializedDiscoveryStats.getClusterStateStats().getPersistenceStats().size()); + PersistedStateStats deserializedRemoteStateStats = deserializedDiscoveryStats.getClusterStateStats() + .getPersistenceStats() + .get(0); + PersistedStateStats remoteStateStats = stateStats.getPersistenceStats().get(0); + 
assertEquals(remoteStateStats.getFailedCount(), deserializedRemoteStateStats.getFailedCount()); + assertEquals(remoteStateStats.getSuccessCount(), deserializedRemoteStateStats.getSuccessCount()); + assertEquals(remoteStateStats.getTotalTimeInMillis(), deserializedRemoteStateStats.getTotalTimeInMillis()); + } } IngestStats ingestStats = nodeStats.getIngestStats(); IngestStats deserializedIngestStats = deserializedNodeStats.getIngestStats(); @@ -725,12 +748,16 @@ public static NodeStats createNodeStats(boolean remoteStoreStats) { ScriptStats scriptStats = frequently() ? new ScriptStats(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()) : null; + ClusterStateStats stateStats = new ClusterStateStats(); + RemotePersistenceStats remoteStateStats = new RemotePersistenceStats(); + stateStats.setPersistenceStats(Arrays.asList(remoteStateStats)); DiscoveryStats discoveryStats = frequently() ? new DiscoveryStats( randomBoolean() ? new PendingClusterStateStats(randomInt(), randomInt(), randomInt()) : null, randomBoolean() ? new PublishClusterStateStats(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()) - : null + : null, + randomBoolean() ? stateStats : null ) : null; IngestStats ingestStats = null; diff --git a/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java b/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java index 9cdbe04e0a0e4..4c0ca826f5dcc 100644 --- a/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java +++ b/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java @@ -691,6 +691,9 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS submittedTasksPerThread.get(entry.getKey()).get() ); } + // verify stats values after state is published + assertEquals(1, clusterManagerService.getClusterStateStats().getUpdateSuccess()); + assertEquals(0, clusterManagerService.getClusterStateStats().getUpdateFailed()); } } diff --git a/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java b/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java index 1d5c2a0f01b5c..fd113ed4313d7 100644 --- a/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java +++ b/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java @@ -68,6 +68,7 @@ import org.opensearch.gateway.PersistedClusterStateService.Writer; import org.opensearch.gateway.remote.ClusterMetadataManifest; import org.opensearch.gateway.remote.RemoteClusterStateService; +import org.opensearch.gateway.remote.RemotePersistenceStats; import org.opensearch.index.recovery.RemoteStoreRestoreService; import org.opensearch.index.recovery.RemoteStoreRestoreService.RemoteRestoreResult; import org.opensearch.node.Node; @@ -104,6 +105,7 @@ import static org.hamcrest.Matchers.nullValue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.Mockito.doCallRealMethod; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -779,6 +781,26 @@ public void testRemotePersistedStateExceptionOnFullStateUpload() throws IOExcept assertThrows(OpenSearchException.class, () -> remotePersistedState.setLastAcceptedState(clusterState)); } + public void testRemotePersistedStateFailureStats() throws IOException { + RemotePersistenceStats remoteStateStats = new 
RemotePersistenceStats(); + final RemoteClusterStateService remoteClusterStateService = Mockito.mock(RemoteClusterStateService.class); + final String previousClusterUUID = "prev-cluster-uuid"; + Mockito.doThrow(IOException.class).when(remoteClusterStateService).writeFullMetadata(Mockito.any(), Mockito.any()); + when(remoteClusterStateService.getStats()).thenReturn(remoteStateStats); + doCallRealMethod().when(remoteClusterStateService).writeMetadataFailed(); + CoordinationState.PersistedState remotePersistedState = new RemotePersistedState(remoteClusterStateService, previousClusterUUID); + + final long clusterTerm = randomNonNegativeLong(); + final ClusterState clusterState = createClusterState( + randomNonNegativeLong(), + Metadata.builder().coordinationMetadata(CoordinationMetadata.builder().term(clusterTerm).build()).build() + ); + + assertThrows(OpenSearchException.class, () -> remotePersistedState.setLastAcceptedState(clusterState)); + assertEquals(1, remoteClusterStateService.getStats().getFailedCount()); + assertEquals(0, remoteClusterStateService.getStats().getSuccessCount()); + } + public void testGatewayForRemoteState() throws IOException { MockGatewayMetaState gateway = null; try { diff --git a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java index 173e15b8eca37..5a43864f40c0c 100644 --- a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java +++ b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java @@ -324,6 +324,7 @@ public void testWriteFullMetadataInParallelFailureForIndexMetadata() throws IOEx RemoteClusterStateService.IndexMetadataTransferException.class, () -> remoteClusterStateService.writeFullMetadata(clusterState, randomAlphaOfLength(10)) ); + assertEquals(0, remoteClusterStateService.getStats().getSuccessCount()); } public void testFailWriteIncrementalMetadataNonClusterManagerNode() throws IOException { @@ -331,6 +332,7 @@ public void testFailWriteIncrementalMetadataNonClusterManagerNode() throws IOExc remoteClusterStateService.start(); final ClusterMetadataManifest manifest = remoteClusterStateService.writeIncrementalMetadata(clusterState, clusterState, null); Assert.assertThat(manifest, nullValue()); + assertEquals(0, remoteClusterStateService.getStats().getSuccessCount()); } public void testFailWriteIncrementalMetadataWhenTermChanged() { @@ -991,6 +993,38 @@ public void testDeleteStaleClusterUUIDs() throws IOException { } } + public void testRemoteStateStats() throws IOException { + final ClusterState clusterState = generateClusterStateWithOneIndex().nodes(nodesWithLocalNodeClusterManager()).build(); + mockBlobStoreObjects(); + remoteClusterStateService.start(); + final ClusterMetadataManifest manifest = remoteClusterStateService.writeFullMetadata(clusterState, "prev-cluster-uuid"); + + assertTrue(remoteClusterStateService.getStats() != null); + assertEquals(1, remoteClusterStateService.getStats().getSuccessCount()); + assertEquals(0, remoteClusterStateService.getStats().getCleanupAttemptFailedCount()); + assertEquals(0, remoteClusterStateService.getStats().getFailedCount()); + } + + public void testRemoteStateCleanupFailureStats() throws IOException { + BlobContainer blobContainer = mock(BlobContainer.class); + doThrow(IOException.class).when(blobContainer).delete(); + when(blobStore.blobContainer(any())).thenReturn(blobContainer); + BlobPath blobPath = new 
BlobPath().add("random-path"); + when((blobStoreRepository.basePath())).thenReturn(blobPath); + remoteClusterStateService.start(); + remoteClusterStateService.deleteStaleUUIDsClusterMetadata("cluster1", Arrays.asList("cluster-uuid1")); + try { + assertBusy(() -> { + // wait for stats to get updated + assertTrue(remoteClusterStateService.getStats() != null); + assertEquals(0, remoteClusterStateService.getStats().getSuccessCount()); + assertEquals(1, remoteClusterStateService.getStats().getCleanupAttemptFailedCount()); + }); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + public void testFileNames() { final Index index = new Index("test-index", "index-uuid"); final Settings idxSettings = Settings.builder() diff --git a/test/framework/src/main/java/org/opensearch/cluster/coordination/AbstractCoordinatorTestCase.java b/test/framework/src/main/java/org/opensearch/cluster/coordination/AbstractCoordinatorTestCase.java index d24cc24d28579..28d7706fb1493 100644 --- a/test/framework/src/main/java/org/opensearch/cluster/coordination/AbstractCoordinatorTestCase.java +++ b/test/framework/src/main/java/org/opensearch/cluster/coordination/AbstractCoordinatorTestCase.java @@ -1016,6 +1016,11 @@ public void setLastAcceptedState(ClusterState clusterState) { delegate.setLastAcceptedState(clusterState); } + @Override + public PersistedStateStats getStats() { + return null; + } + @Override public void close() { assertTrue(openPersistedStates.remove(this)); From 6f36752d9e84e95ce2280347cc26b0c9138b2d57 Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Tue, 24 Oct 2023 14:31:01 +0530 Subject: [PATCH 06/33] Sync translog to remote on primary activate (#10839) --------- Signed-off-by: Sachin Kale Co-authored-by: Sachin Kale --- .../remotestore/RemoteRestoreSnapshotIT.java | 94 +++++++++++++++++++ .../remotestore/RemoteStoreStatsIT.java | 8 +- .../opensearch/index/shard/IndexShard.java | 19 +++- .../index/shard/IndexShardTests.java | 1 + 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java index 865b2d13f189e..9e0b2a66467de 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java @@ -11,6 +11,7 @@ import org.opensearch.action.DocWriteResponse; import org.opensearch.action.admin.cluster.remotestore.restore.RestoreRemoteStoreRequest; import org.opensearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse; +import org.opensearch.action.admin.indices.delete.DeleteIndexRequest; import org.opensearch.action.admin.indices.get.GetIndexRequest; import org.opensearch.action.admin.indices.get.GetIndexResponse; import org.opensearch.action.delete.DeleteResponse; @@ -20,8 +21,13 @@ import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.io.PathUtils; import org.opensearch.common.settings.Settings; +import org.opensearch.common.util.io.IOUtils; +import org.opensearch.core.index.Index; import org.opensearch.core.rest.RestStatus; +import org.opensearch.index.IndexService; import org.opensearch.index.IndexSettings; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.indices.IndicesService; import org.opensearch.indices.replication.common.ReplicationType; import org.opensearch.snapshots.AbstractSnapshotIntegTestCase; 
import org.opensearch.snapshots.SnapshotInfo; @@ -32,11 +38,15 @@ import org.junit.Before; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Optional; import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_REMOTE_SEGMENT_STORE_REPOSITORY; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_REMOTE_STORE_ENABLED; @@ -345,6 +355,90 @@ public void testRestoreInSameRemoteStoreEnabledIndex() throws IOException { assertDocsPresentInIndex(client, indexName1, numDocsInIndex1 + 4); } + public void testRemoteRestoreIndexRestoredFromSnapshot() throws IOException, ExecutionException, InterruptedException { + internalCluster().startClusterManagerOnlyNode(); + internalCluster().startDataOnlyNodes(2); + + String indexName1 = "testindex1"; + String snapshotRepoName = "test-restore-snapshot-repo"; + String snapshotName1 = "test-restore-snapshot1"; + Path absolutePath1 = randomRepoPath().toAbsolutePath(); + logger.info("Snapshot Path [{}]", absolutePath1); + + createRepository(snapshotRepoName, "fs", getRepositorySettings(absolutePath1, true)); + + Settings indexSettings = getIndexSettings(1, 0).build(); + createIndex(indexName1, indexSettings); + + final int numDocsInIndex1 = randomIntBetween(20, 30); + indexDocuments(client(), indexName1, numDocsInIndex1); + flushAndRefresh(indexName1); + ensureGreen(indexName1); + + logger.info("--> snapshot"); + SnapshotInfo snapshotInfo1 = createSnapshot(snapshotRepoName, snapshotName1, new ArrayList<>(Arrays.asList(indexName1))); + assertThat(snapshotInfo1.successfulShards(), greaterThan(0)); + assertThat(snapshotInfo1.successfulShards(), equalTo(snapshotInfo1.totalShards())); + assertThat(snapshotInfo1.state(), equalTo(SnapshotState.SUCCESS)); + + assertAcked(client().admin().indices().delete(new DeleteIndexRequest(indexName1)).get()); + assertFalse(indexExists(indexName1)); + + RestoreSnapshotResponse restoreSnapshotResponse1 = client().admin() + .cluster() + .prepareRestoreSnapshot(snapshotRepoName, snapshotName1) + .setWaitForCompletion(false) + .setIndices(indexName1) + .get(); + + assertEquals(restoreSnapshotResponse1.status(), RestStatus.ACCEPTED); + ensureGreen(indexName1); + assertDocsPresentInIndex(client(), indexName1, numDocsInIndex1); + + // Make sure remote translog is empty + String indexUUID = client().admin() + .indices() + .prepareGetSettings(indexName1) + .get() + .getSetting(indexName1, IndexMetadata.SETTING_INDEX_UUID); + + Path remoteTranslogMetadataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/translog/metadata"); + Path remoteTranslogDataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/translog/data"); + + try ( + Stream translogMetadata = Files.list(remoteTranslogMetadataPath); + Stream translogData = Files.list(remoteTranslogDataPath) + ) { + assertTrue(translogData.count() > 0); + assertTrue(translogMetadata.count() > 0); + } + + // Clear the local data before stopping the node. This will make sure that remote translog is empty. 
+ IndexShard indexShard = getIndexShard(primaryNodeName(indexName1), indexName1); + try (Stream files = Files.list(indexShard.shardPath().resolveTranslog())) { + IOUtils.deleteFilesIgnoringExceptions(files.collect(Collectors.toList())); + } + internalCluster().stopRandomNode(InternalTestCluster.nameFilter(primaryNodeName(indexName1))); + + ensureRed(indexName1); + + client().admin() + .cluster() + .restoreRemoteStore(new RestoreRemoteStoreRequest().indices(indexName1).restoreAllShards(false), PlainActionFuture.newFuture()); + + ensureGreen(indexName1); + assertDocsPresentInIndex(client(), indexName1, numDocsInIndex1); + } + + protected IndexShard getIndexShard(String node, String indexName) { + final Index index = resolveIndex(indexName); + IndicesService indicesService = internalCluster().getInstance(IndicesService.class, node); + IndexService indexService = indicesService.indexService(index); + assertNotNull(indexService); + final Optional shardId = indexService.shardIds().stream().findFirst(); + return shardId.map(indexService::getShard).orElse(null); + } + public void testRestoreShallowCopySnapshotWithDifferentRepo() throws IOException { String clusterManagerNode = internalCluster().startClusterManagerOnlyNode(); String primary = internalCluster().startDataOnlyNode(); diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java index 5e91176ed0473..b1dbb0a900bc7 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java @@ -581,21 +581,23 @@ public void testNonZeroPrimaryStatsOnNewlyCreatedIndexWithZeroDocs() throws Exce .getRemoteStoreStats(); Arrays.stream(remoteStoreStats).forEach(statObject -> { RemoteSegmentTransferTracker.Stats segmentStats = statObject.getSegmentStats(); + RemoteTranslogTransferTracker.Stats translogStats = statObject.getTranslogStats(); if (statObject.getShardRouting().primary()) { assertTrue( segmentStats.totalUploadsSucceeded == 1 && segmentStats.totalUploadsStarted == segmentStats.totalUploadsSucceeded && segmentStats.totalUploadsFailed == 0 ); + // On primary shard creation, we upload to remote translog post primary mode activation. + // This changes upload stats to non-zero for primary shard. + assertNonZeroTranslogUploadStatsNoFailures(translogStats); } else { assertTrue( segmentStats.directoryFileTransferTrackerStats.transferredBytesStarted == 0 && segmentStats.directoryFileTransferTrackerStats.transferredBytesSucceeded == 0 ); + assertZeroTranslogUploadStats(translogStats); } - - RemoteTranslogTransferTracker.Stats translogStats = statObject.getTranslogStats(); - assertZeroTranslogUploadStats(translogStats); assertZeroTranslogDownloadStats(translogStats); }); }, 5, TimeUnit.SECONDS); diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index fb4e9056153aa..3c348035ebbdd 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -640,7 +640,7 @@ public void updateShardState( if (currentRouting.initializing() && currentRouting.isRelocationTarget() == false && newRouting.active()) { // the cluster-manager started a recovering primary, activate primary mode. 
replicationTracker.activatePrimaryMode(getLocalCheckpoint()); - ensurePeerRecoveryRetentionLeasesExist(); + postActivatePrimaryMode(); } } else { assert currentRouting.primary() == false : "term is only increased as part of primary promotion"; @@ -711,8 +711,7 @@ public void updateShardState( // are brought up to date. checkpointPublisher.publish(this, getLatestReplicationCheckpoint()); } - - ensurePeerRecoveryRetentionLeasesExist(); + postActivatePrimaryMode(); /* * If this shard was serving as a replica shard when another shard was promoted to primary then * its Lucene index was reset during the primary term transition. In particular, the Lucene index @@ -3393,6 +3392,20 @@ assert getLocalCheckpoint() == primaryContext.getCheckpointStates().get(routingE synchronized (mutex) { replicationTracker.activateWithPrimaryContext(primaryContext); // make changes to primaryMode flag only under mutex } + postActivatePrimaryMode(); + } + + private void postActivatePrimaryMode() { + if (indexSettings.isRemoteStoreEnabled()) { + // We make sure to upload translog (even if it does not contain any operations) to remote translog. + // This helps to get a consistent state in remote store where both remote segment store and remote + // translog contains data. + try { + getEngine().translogManager().syncTranslog(); + } catch (IOException e) { + logger.error("Failed to sync translog to remote from new primary", e); + } + } ensurePeerRecoveryRetentionLeasesExist(); } diff --git a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java index 9ef9bec01cb38..fa3cf7676f55c 100644 --- a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java @@ -2745,6 +2745,7 @@ public void testRelocatedForRemoteTranslogBackedIndexWithAsyncDurability() throw AllocationId.newRelocation(routing.allocationId()) ); IndexShardTestCase.updateRoutingEntry(indexShard, routing); + indexDoc(indexShard, "_doc", "0"); assertTrue(indexShard.isSyncNeeded()); try { indexShard.relocated(routing.getTargetRelocatingShard().allocationId().getId(), primaryContext -> {}, () -> {}); From 5bd413c588f48589c6fd6c4de4e87550271aecf8 Mon Sep 17 00:00:00 2001 From: Peter Nied Date: Tue, 24 Oct 2023 07:38:18 -0400 Subject: [PATCH 07/33] GHA to verify checklist items completion in PR descriptions (#10800) Signed-off-by: Peter Nied --- .github/workflows/pull-request-checks.yml | 28 +++++++++++++++++++++++ CHANGELOG.md | 1 + 2 files changed, 29 insertions(+) create mode 100644 .github/workflows/pull-request-checks.yml diff --git a/.github/workflows/pull-request-checks.yml b/.github/workflows/pull-request-checks.yml new file mode 100644 index 0000000000000..11998e36c2dbb --- /dev/null +++ b/.github/workflows/pull-request-checks.yml @@ -0,0 +1,28 @@ +name: Pull Request Checks + +on: + pull_request: + types: + [ + opened, + edited, + review_requested, + synchronize, + reopened, + ready_for_review, + ] + +jobs: + verify-description-checklist: + name: Verify Description Checklist + runs-on: ubuntu-latest + steps: + - uses: peternied/check-pull-request-description-checklist@v1 + with: + checklist-items: | + New functionality includes testing. + All tests pass + New functionality has been documented. 
+ New functionality has javadoc added + Commits are signed per the DCO using --signoff + Commit changes are listed out in CHANGELOG.md file (See: [Changelog](../blob/main/CONTRIBUTING.md#changelog)) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c7e3ee151d64..b40878066960a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Remote cluster state] Download functionality of global metadata from remote store ([#10535](https://github.com/opensearch-project/OpenSearch/pull/10535)) - [Remote cluster state] Restore global metadata from remote store when local state is lost after quorum loss ([#10404](https://github.com/opensearch-project/OpenSearch/pull/10404)) - [AdmissionControl] Added changes for AdmissionControl Interceptor and AdmissionControlService for RateLimiting ([#9286](https://github.com/opensearch-project/OpenSearch/pull/9286)) +- GHA to verify checklist items completion in PR descriptions ([#10800](https://github.com/opensearch-project/OpenSearch/pull/10800)) ### Dependencies - Bump `log4j-core` from 2.18.0 to 2.19.0 From 91ac0846dbeb7379ae0772bf9f144cd628e9deac Mon Sep 17 00:00:00 2001 From: Varun Bansal Date: Wed, 25 Oct 2023 14:01:27 +0530 Subject: [PATCH 08/33] link previous cluster uuid to current cluster uuid even if current cluster uuid is not committed (#10832) * link previous cluster uuid to current cluster uuid even if current cluster uuid is not committed Signed-off-by: bansvaru --- .../opensearch/gateway/GatewayMetaState.java | 29 +++++++------- .../GatewayMetaStatePersistedStateTests.java | 39 +++++++++++++++++++ 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java b/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java index 350a361a49a62..c3056276706a0 100644 --- a/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java +++ b/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java @@ -695,24 +695,21 @@ public void setLastAcceptedState(ClusterState clusterState) { try { final ClusterMetadataManifest manifest; if (shouldWriteFullClusterState(clusterState)) { - if (clusterState.metadata().clusterUUIDCommitted() == true) { - final Optional latestManifest = remoteClusterStateService.getLatestClusterMetadataManifest( - clusterState.getClusterName().value(), + final Optional latestManifest = remoteClusterStateService.getLatestClusterMetadataManifest( + clusterState.getClusterName().value(), + clusterState.metadata().clusterUUID() + ); + if (latestManifest.isPresent()) { + // The previous UUID should not change for the current UUID. So fetching the latest manifest + // from remote store and getting the previous UUID. + previousClusterUUID = latestManifest.get().getPreviousClusterUUID(); + } else { + // When the user starts the cluster with remote state disabled but later enables the remote state, + // there will not be any manifest for the current cluster UUID. + logger.error( + "Latest manifest is not present in remote store for cluster UUID: {}", clusterState.metadata().clusterUUID() ); - if (latestManifest.isPresent()) { - // The previous UUID should not change for the current UUID. So fetching the latest manifest - // from remote store and getting the previous UUID. 
- previousClusterUUID = latestManifest.get().getPreviousClusterUUID(); - } else { - // When the user starts the cluster with remote state disabled but later enables the remote state, - // there will not be any manifest for the current cluster UUID. - logger.error( - "Latest manifest is not present in remote store for cluster UUID: {}", - clusterState.metadata().clusterUUID() - ); - previousClusterUUID = ClusterState.UNKNOWN_UUID; - } } manifest = remoteClusterStateService.writeFullMetadata(clusterState, previousClusterUUID); } else { diff --git a/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java b/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java index fd113ed4313d7..74bae7b5eb7cf 100644 --- a/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java +++ b/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java @@ -87,10 +87,12 @@ import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.Optional; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Supplier; +import org.mockito.ArgumentCaptor; import org.mockito.Mockito; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_INDEX_UUID; @@ -765,6 +767,43 @@ public void testRemotePersistedState() throws IOException { assertThat(remotePersistedState.getLastAcceptedState().metadata().clusterUUIDCommitted(), equalTo(true)); } + public void testRemotePersistedStateNotCommitted() throws IOException { + final RemoteClusterStateService remoteClusterStateService = Mockito.mock(RemoteClusterStateService.class); + final String previousClusterUUID = "prev-cluster-uuid"; + final ClusterMetadataManifest manifest = ClusterMetadataManifest.builder() + .previousClusterUUID(previousClusterUUID) + .clusterTerm(1L) + .stateVersion(5L) + .build(); + Mockito.when(remoteClusterStateService.getLatestClusterMetadataManifest(Mockito.any(), Mockito.any())) + .thenReturn(Optional.of(manifest)); + Mockito.when(remoteClusterStateService.writeFullMetadata(Mockito.any(), Mockito.any())).thenReturn(manifest); + + Mockito.when(remoteClusterStateService.writeIncrementalMetadata(Mockito.any(), Mockito.any(), Mockito.any())).thenReturn(manifest); + CoordinationState.PersistedState remotePersistedState = new RemotePersistedState( + remoteClusterStateService, + ClusterState.UNKNOWN_UUID + ); + + assertThat(remotePersistedState.getLastAcceptedState(), nullValue()); + assertThat(remotePersistedState.getCurrentTerm(), equalTo(0L)); + + final long clusterTerm = randomNonNegativeLong(); + ClusterState clusterState = createClusterState( + randomNonNegativeLong(), + Metadata.builder().coordinationMetadata(CoordinationMetadata.builder().term(clusterTerm).build()).build() + ); + clusterState = ClusterState.builder(clusterState) + .metadata(Metadata.builder(clusterState.getMetadata()).clusterUUID(randomAlphaOfLength(10)).clusterUUIDCommitted(false).build()) + .build(); + + remotePersistedState.setLastAcceptedState(clusterState); + ArgumentCaptor previousClusterUUIDCaptor = ArgumentCaptor.forClass(String.class); + ArgumentCaptor clusterStateCaptor = ArgumentCaptor.forClass(ClusterState.class); + Mockito.verify(remoteClusterStateService).writeFullMetadata(clusterStateCaptor.capture(), previousClusterUUIDCaptor.capture()); + assertEquals(previousClusterUUID, previousClusterUUIDCaptor.getValue()); + } + public void 
testRemotePersistedStateExceptionOnFullStateUpload() throws IOException { final RemoteClusterStateService remoteClusterStateService = Mockito.mock(RemoteClusterStateService.class); final String previousClusterUUID = "prev-cluster-uuid"; From b5299f13e0ca9a5f6979e8cb50137682e777b095 Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Wed, 25 Oct 2023 14:46:44 +0530 Subject: [PATCH 09/33] Delete corrupted file to re-download from remote store (#10891) --------- Signed-off-by: Sachin Kale Co-authored-by: Sachin Kale --- .../opensearch/index/shard/IndexShard.java | 5 +- .../index/shard/IndexShardTests.java | 52 +++++++++++++++++++ .../org/opensearch/test/CorruptionUtils.java | 2 +- 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 3c348035ebbdd..5b6257084e440 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -4962,7 +4962,8 @@ private String copySegmentFiles( return segmentNFile; } - private boolean localDirectoryContains(Directory localDirectory, String file, long checksum) { + // Visible for testing + boolean localDirectoryContains(Directory localDirectory, String file, long checksum) throws IOException { try (IndexInput indexInput = localDirectory.openInput(file, IOContext.DEFAULT)) { if (checksum == CodecUtil.retrieveChecksum(indexInput)) { return true; @@ -4981,6 +4982,8 @@ private boolean localDirectoryContains(Directory localDirectory, String file, lo logger.debug("File {} does not exist in local FS, downloading from remote store", file); } catch (IOException e) { logger.warn("Exception while reading checksum of file: {}, this can happen if file is corrupted", file); + // For any other exception on reading checksum, we delete the file to re-download again + localDirectory.deleteFile(file); } return false; } diff --git a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java index fa3cf7676f55c..f5f8cd1dcfb3f 100644 --- a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java @@ -32,6 +32,7 @@ package org.opensearch.index.shard; import org.apache.logging.log4j.Logger; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexCommit; @@ -45,6 +46,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.tests.mockfile.ExtrasFS; import org.apache.lucene.tests.store.BaseDirectoryWrapper; import org.apache.lucene.util.BytesRef; @@ -91,6 +93,7 @@ import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.index.shard.ShardId; import org.opensearch.core.indices.breaker.NoneCircuitBreakerService; +import org.opensearch.core.util.FileSystemUtils; import org.opensearch.core.xcontent.MediaTypeRegistry; import org.opensearch.core.xcontent.NamedXContentRegistry; import org.opensearch.core.xcontent.XContentBuilder; @@ -163,11 +166,13 @@ import org.junit.Assert; import java.io.IOException; +import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.nio.file.FileVisitResult; 
import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.SimpleFileVisitor; +import java.nio.file.StandardOpenOption; import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; import java.util.Arrays; @@ -4907,6 +4912,53 @@ public void testRecordsForceMerges() throws IOException { closeShards(shard); } + public void testLocalDirectoryContains() throws IOException { + IndexShard indexShard = newStartedShard(true); + int numDocs = between(1, 10); + for (int i = 0; i < numDocs; i++) { + indexDoc(indexShard, "_doc", Integer.toString(i)); + } + flushShard(indexShard); + indexShard.store().incRef(); + Directory localDirectory = indexShard.store().directory(); + Path shardPath = indexShard.shardPath().getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME); + Path tempDir = createTempDir(); + for (String file : localDirectory.listAll()) { + if (file.equals("write.lock") || file.startsWith("extra")) { + continue; + } + boolean corrupted = randomBoolean(); + long checksum = 0; + try (IndexInput indexInput = localDirectory.openInput(file, IOContext.DEFAULT)) { + checksum = CodecUtil.retrieveChecksum(indexInput); + } + if (corrupted) { + Files.copy(shardPath.resolve(file), tempDir.resolve(file)); + try (FileChannel raf = FileChannel.open(shardPath.resolve(file), StandardOpenOption.READ, StandardOpenOption.WRITE)) { + CorruptionUtils.corruptAt(shardPath.resolve(file), raf, (int) (raf.size() - 8)); + } + } + if (corrupted == false) { + assertTrue(indexShard.localDirectoryContains(localDirectory, file, checksum)); + } else { + assertFalse(indexShard.localDirectoryContains(localDirectory, file, checksum)); + assertFalse(Files.exists(shardPath.resolve(file))); + } + } + try (Stream files = Files.list(tempDir)) { + files.forEach(p -> { + try { + Files.copy(p, shardPath.resolve(p.getFileName())); + } catch (IOException e) { + // Ignore + } + }); + } + FileSystemUtils.deleteSubDirectories(tempDir); + indexShard.store().decRef(); + closeShards(indexShard); + } + private void populateSampleRemoteSegmentStats(RemoteSegmentTransferTracker tracker) { tracker.addUploadBytesStarted(30L); tracker.addUploadBytesSucceeded(10L); diff --git a/test/framework/src/main/java/org/opensearch/test/CorruptionUtils.java b/test/framework/src/main/java/org/opensearch/test/CorruptionUtils.java index 0dce5e78bf91f..67522bb618cf1 100644 --- a/test/framework/src/main/java/org/opensearch/test/CorruptionUtils.java +++ b/test/framework/src/main/java/org/opensearch/test/CorruptionUtils.java @@ -121,7 +121,7 @@ public static void corruptFile(Random random, Path... 
files) throws IOException } } - static void corruptAt(Path path, FileChannel channel, int position) throws IOException { + public static void corruptAt(Path path, FileChannel channel, int position) throws IOException { // read channel.position(position); long filePointer = channel.position(); From a890e518aea1a706249001c3420c1740bd1a06dd Mon Sep 17 00:00:00 2001 From: Dhwanil Patel Date: Wed, 25 Oct 2023 20:05:07 +0530 Subject: [PATCH 10/33] Fix exception handling for global metadata upload (#10889) Signed-off-by: Dhwanil Patel --- .../gateway/remote/RemoteClusterStateService.java | 8 ++++++-- .../gateway/remote/RemoteClusterStateServiceTests.java | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index 329ebd0dcd2b8..57b1b972e08c0 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -369,6 +369,8 @@ public ClusterMetadataManifest writeIncrementalMetadata( private String writeGlobalMetadata(ClusterState clusterState) throws IOException { AtomicReference result = new AtomicReference(); + AtomicReference exceptionReference = new AtomicReference(); + final BlobContainer globalMetadataContainer = globalMetadataContainer( clusterState.getClusterName().value(), clusterState.metadata().clusterUUID() @@ -381,7 +383,7 @@ private String writeGlobalMetadata(ClusterState clusterState) throws IOException LatchedActionListener completionListener = new LatchedActionListener<>(ActionListener.wrap(resp -> { logger.trace(String.format(Locale.ROOT, "GlobalMetadata uploaded successfully.")); result.set(globalMetadataContainer.path().buildAsString() + globalMetadataFilename); - }, ex -> { throw new GlobalMetadataTransferException(ex.getMessage(), ex); }), latch); + }, ex -> { exceptionReference.set(ex); }), latch); GLOBAL_METADATA_FORMAT.writeAsyncWithUrgentPriority( clusterState.metadata(), @@ -408,7 +410,9 @@ private String writeGlobalMetadata(ClusterState clusterState) throws IOException Thread.currentThread().interrupt(); throw exception; } - + if (exceptionReference.get() != null) { + throw new GlobalMetadataTransferException(exceptionReference.get().getMessage(), exceptionReference.get()); + } return result.get(); } diff --git a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java index 5a43864f40c0c..ca88653f529f6 100644 --- a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java +++ b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java @@ -294,7 +294,13 @@ public void testWriteFullMetadataFailureForGlobalMetadata() throws IOException { ArgumentCaptor> actionListenerArgumentCaptor = ArgumentCaptor.forClass(ActionListener.class); doAnswer((i) -> { - actionListenerArgumentCaptor.getValue().onFailure(new RuntimeException("Cannot upload to remote")); + // For async write action listener will be called from different thread, replicating same behaviour here. 
+ new Thread(new Runnable() { + @Override + public void run() { + actionListenerArgumentCaptor.getValue().onFailure(new RuntimeException("Cannot upload to remote")); + } + }).start(); return null; }).when(container).asyncBlobUpload(any(WriteContext.class), actionListenerArgumentCaptor.capture()); From 3e64a7b92192a37546b92b811e04c4155dafbdfd Mon Sep 17 00:00:00 2001 From: Aman Khare <85096200+amkhar@users.noreply.github.com> Date: Wed, 25 Oct 2023 21:13:52 +0530 Subject: [PATCH 11/33] Change version to 2.12.0 for cluster state stats (#10915) Signed-off-by: Aman Khare Co-authored-by: Aman Khare --- .../main/java/org/opensearch/discovery/DiscoveryStats.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java b/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java index ea93ccd09ed39..fb341ac2ac569 100644 --- a/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java +++ b/server/src/main/java/org/opensearch/discovery/DiscoveryStats.java @@ -64,7 +64,7 @@ public DiscoveryStats(PendingClusterStateStats queueStats, PublishClusterStateSt public DiscoveryStats(StreamInput in) throws IOException { queueStats = in.readOptionalWriteable(PendingClusterStateStats::new); publishStats = in.readOptionalWriteable(PublishClusterStateStats::new); - if (in.getVersion().onOrAfter(Version.V_3_0_0)) { + if (in.getVersion().onOrAfter(Version.V_2_12_0)) { clusterStateStats = in.readOptionalWriteable(ClusterStateStats::new); } else { clusterStateStats = null; @@ -75,7 +75,7 @@ public DiscoveryStats(StreamInput in) throws IOException { public void writeTo(StreamOutput out) throws IOException { out.writeOptionalWriteable(queueStats); out.writeOptionalWriteable(publishStats); - if (out.getVersion().onOrAfter(Version.V_3_0_0)) { + if (out.getVersion().onOrAfter(Version.V_2_12_0)) { out.writeOptionalWriteable(clusterStateStats); } } From 6779633d76052585f5e20df5ac9388e14d75ffb2 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 25 Oct 2023 09:58:16 -0700 Subject: [PATCH 12/33] Fix flaky test testSendCorruptBytesToReplica (#10897) --- .../SegmentReplicationDisruptionIT.java | 167 ++++++++++++++++++ .../replication/SegmentReplicationIT.java | 136 -------------- 2 files changed, 167 insertions(+), 136 deletions(-) create mode 100644 server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationDisruptionIT.java diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationDisruptionIT.java new file mode 100644 index 0000000000000..66b26b5d25cfe --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationDisruptionIT.java @@ -0,0 +1,167 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.indices.replication; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.opensearch.action.admin.indices.recovery.RecoveryResponse; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.common.bytes.BytesArray; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.indices.recovery.FileChunkRequest; +import org.opensearch.indices.recovery.RecoveryState; +import org.opensearch.test.OpenSearchIntegTestCase; +import org.opensearch.test.transport.MockTransportService; +import org.opensearch.transport.TransportRequest; +import org.opensearch.transport.TransportService; +import org.junit.Before; + +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder; + +/** + * These tests simulate corruption cases during replication. They are skipped on WindowsFS simulation where file renaming + * can fail with an access denied IOException because deletion is not permitted. + */ +@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0) +@LuceneTestCase.SuppressFileSystems("WindowsFS") +public class SegmentReplicationDisruptionIT extends SegmentReplicationBaseIT { + @Before + private void setup() { + internalCluster().startClusterManagerOnlyNode(); + } + + public void testSendCorruptBytesToReplica() throws Exception { + final String primaryNode = internalCluster().startDataOnlyNode(); + createIndex( + INDEX_NAME, + Settings.builder() + .put(indexSettings()) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .put("index.refresh_interval", -1) + .build() + ); + ensureYellow(INDEX_NAME); + final String replicaNode = internalCluster().startDataOnlyNode(); + ensureGreen(INDEX_NAME); + + MockTransportService primaryTransportService = ((MockTransportService) internalCluster().getInstance( + TransportService.class, + primaryNode + )); + CountDownLatch latch = new CountDownLatch(1); + AtomicBoolean failed = new AtomicBoolean(false); + primaryTransportService.addSendBehavior( + internalCluster().getInstance(TransportService.class, replicaNode), + (connection, requestId, action, request, options) -> { + if (action.equals(SegmentReplicationTargetService.Actions.FILE_CHUNK) && failed.getAndSet(true) == false) { + FileChunkRequest req = (FileChunkRequest) request; + TransportRequest corrupt = new FileChunkRequest( + req.recoveryId(), + ((FileChunkRequest) request).requestSeqNo(), + ((FileChunkRequest) request).shardId(), + ((FileChunkRequest) request).metadata(), + ((FileChunkRequest) request).position(), + new BytesArray("test"), + false, + 0, + 0L + ); + connection.sendRequest(requestId, action, corrupt, options); + latch.countDown(); + } else { + connection.sendRequest(requestId, action, request, options); + } + } + ); + for (int i = 0; i < 100; i++) { + client().prepareIndex(INDEX_NAME) + .setId(String.valueOf(i)) + .setSource(jsonBuilder().startObject().field("field", i).endObject()) + .get(); + } + final long originalRecoveryTime = getRecoveryStopTime(replicaNode); + assertNotEquals(originalRecoveryTime, 0); + refresh(INDEX_NAME); + latch.await(); + assertTrue(failed.get()); + waitForNewPeerRecovery(replicaNode, originalRecoveryTime); + // reset checkIndex to ensure our original shard doesn't throw + 
resetCheckIndexStatus(); + waitForSearchableDocs(100, primaryNode, replicaNode); + } + + public void testWipeSegmentBetweenSyncs() throws Exception { + internalCluster().startClusterManagerOnlyNode(); + final String primaryNode = internalCluster().startDataOnlyNode(); + createIndex( + INDEX_NAME, + Settings.builder() + .put(indexSettings()) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .put("index.refresh_interval", -1) + .build() + ); + ensureYellow(INDEX_NAME); + final String replicaNode = internalCluster().startDataOnlyNode(); + ensureGreen(INDEX_NAME); + + for (int i = 0; i < 10; i++) { + client().prepareIndex(INDEX_NAME) + .setId(String.valueOf(i)) + .setSource(jsonBuilder().startObject().field("field", i).endObject()) + .get(); + } + refresh(INDEX_NAME); + ensureGreen(INDEX_NAME); + final long originalRecoveryTime = getRecoveryStopTime(replicaNode); + + final IndexShard indexShard = getIndexShard(replicaNode, INDEX_NAME); + waitForSearchableDocs(INDEX_NAME, 10, List.of(replicaNode)); + indexShard.store().directory().deleteFile("_0.si"); + + for (int i = 11; i < 21; i++) { + client().prepareIndex(INDEX_NAME) + .setId(String.valueOf(i)) + .setSource(jsonBuilder().startObject().field("field", i).endObject()) + .get(); + } + refresh(INDEX_NAME); + waitForNewPeerRecovery(replicaNode, originalRecoveryTime); + resetCheckIndexStatus(); + waitForSearchableDocs(20, primaryNode, replicaNode); + } + + private void waitForNewPeerRecovery(String replicaNode, long originalRecoveryTime) throws Exception { + assertBusy(() -> { + // assert we have a peer recovery after the original + final long time = getRecoveryStopTime(replicaNode); + assertNotEquals(time, 0); + assertNotEquals(originalRecoveryTime, time); + + }, 1, TimeUnit.MINUTES); + } + + private long getRecoveryStopTime(String nodeName) { + final RecoveryResponse recoveryResponse = client().admin().indices().prepareRecoveries(INDEX_NAME).get(); + final List recoveryStates = recoveryResponse.shardRecoveryStates().get(INDEX_NAME); + for (RecoveryState recoveryState : recoveryStates) { + if (recoveryState.getTargetNode().getName().equals(nodeName)) { + return recoveryState.getTimer().stopTime(); + } + } + return 0L; + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java index 81556cc270151..f48df082a25dc 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java @@ -24,7 +24,6 @@ import org.apache.lucene.util.BytesRef; import org.opensearch.action.admin.indices.alias.Alias; import org.opensearch.action.admin.indices.flush.FlushRequest; -import org.opensearch.action.admin.indices.recovery.RecoveryResponse; import org.opensearch.action.admin.indices.stats.IndicesStatsRequest; import org.opensearch.action.admin.indices.stats.IndicesStatsResponse; import org.opensearch.action.get.GetResponse; @@ -59,7 +58,6 @@ import org.opensearch.common.lucene.index.OpenSearchDirectoryReader; import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; -import org.opensearch.core.common.bytes.BytesArray; import org.opensearch.core.common.io.stream.NamedWriteableRegistry; import org.opensearch.core.index.shard.ShardId; import 
org.opensearch.core.xcontent.XContentBuilder; @@ -73,7 +71,6 @@ import org.opensearch.index.engine.NRTReplicationReaderManager; import org.opensearch.index.shard.IndexShard; import org.opensearch.indices.recovery.FileChunkRequest; -import org.opensearch.indices.recovery.RecoveryState; import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import org.opensearch.indices.replication.common.ReplicationType; import org.opensearch.node.NodeClosedException; @@ -85,7 +82,6 @@ import org.opensearch.test.InternalTestCluster; import org.opensearch.test.OpenSearchIntegTestCase; import org.opensearch.test.transport.MockTransportService; -import org.opensearch.transport.TransportRequest; import org.opensearch.transport.TransportService; import org.junit.Before; @@ -98,7 +94,6 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; import static java.util.Arrays.asList; @@ -1781,135 +1776,4 @@ public void testRealtimeTermVectorRequestsUnSuccessful() throws IOException { assertThat(response.getIndex(), equalTo(INDEX_NAME)); } - - public void testSendCorruptBytesToReplica() throws Exception { - // this test stubs transport calls specific to node-node replication. - assumeFalse( - "Skipping the test as its not compatible with segment replication with remote store.", - segmentReplicationWithRemoteEnabled() - ); - final String primaryNode = internalCluster().startDataOnlyNode(); - createIndex( - INDEX_NAME, - Settings.builder() - .put(indexSettings()) - .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) - .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) - .put("index.refresh_interval", -1) - .build() - ); - ensureYellow(INDEX_NAME); - final String replicaNode = internalCluster().startDataOnlyNode(); - ensureGreen(INDEX_NAME); - - MockTransportService primaryTransportService = ((MockTransportService) internalCluster().getInstance( - TransportService.class, - primaryNode - )); - CountDownLatch latch = new CountDownLatch(1); - AtomicBoolean failed = new AtomicBoolean(false); - primaryTransportService.addSendBehavior( - internalCluster().getInstance(TransportService.class, replicaNode), - (connection, requestId, action, request, options) -> { - if (action.equals(SegmentReplicationTargetService.Actions.FILE_CHUNK) && failed.getAndSet(true) == false) { - FileChunkRequest req = (FileChunkRequest) request; - logger.info("SENDING CORRUPT file chunk [{}] lastChunk: {}", req, req.lastChunk()); - TransportRequest corrupt = new FileChunkRequest( - req.recoveryId(), - ((FileChunkRequest) request).requestSeqNo(), - ((FileChunkRequest) request).shardId(), - ((FileChunkRequest) request).metadata(), - ((FileChunkRequest) request).position(), - new BytesArray("test"), - false, - 0, - 0L - ); - connection.sendRequest(requestId, action, corrupt, options); - latch.countDown(); - } else { - connection.sendRequest(requestId, action, request, options); - } - } - ); - for (int i = 0; i < 100; i++) { - client().prepareIndex(INDEX_NAME) - .setId(String.valueOf(i)) - .setSource(jsonBuilder().startObject().field("field", i).endObject()) - .get(); - } - final long originalRecoveryTime = getRecoveryStopTime(replicaNode); - assertNotEquals(originalRecoveryTime, 0); - refresh(INDEX_NAME); - latch.await(); - assertTrue(failed.get()); - waitForNewPeerRecovery(replicaNode, originalRecoveryTime); - // reset checkIndex to ensure our original shard doesn't throw - resetCheckIndexStatus(); - 
waitForSearchableDocs(100, primaryNode, replicaNode); - } - - public void testWipeSegmentBetweenSyncs() throws Exception { - internalCluster().startClusterManagerOnlyNode(); - final String primaryNode = internalCluster().startDataOnlyNode(); - createIndex( - INDEX_NAME, - Settings.builder() - .put(indexSettings()) - .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) - .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) - .put("index.refresh_interval", -1) - .build() - ); - ensureYellow(INDEX_NAME); - final String replicaNode = internalCluster().startDataOnlyNode(); - ensureGreen(INDEX_NAME); - - for (int i = 0; i < 10; i++) { - client().prepareIndex(INDEX_NAME) - .setId(String.valueOf(i)) - .setSource(jsonBuilder().startObject().field("field", i).endObject()) - .get(); - } - refresh(INDEX_NAME); - ensureGreen(INDEX_NAME); - final long originalRecoveryTime = getRecoveryStopTime(replicaNode); - - final IndexShard indexShard = getIndexShard(replicaNode, INDEX_NAME); - waitForSearchableDocs(INDEX_NAME, 10, List.of(replicaNode)); - indexShard.store().directory().deleteFile("_0.si"); - - for (int i = 11; i < 21; i++) { - client().prepareIndex(INDEX_NAME) - .setId(String.valueOf(i)) - .setSource(jsonBuilder().startObject().field("field", i).endObject()) - .get(); - } - refresh(INDEX_NAME); - waitForNewPeerRecovery(replicaNode, originalRecoveryTime); - resetCheckIndexStatus(); - waitForSearchableDocs(20, primaryNode, replicaNode); - } - - private void waitForNewPeerRecovery(String replicaNode, long originalRecoveryTime) throws Exception { - assertBusy(() -> { - // assert we have a peer recovery after the original - final long time = getRecoveryStopTime(replicaNode); - assertNotEquals(time, 0); - assertNotEquals(originalRecoveryTime, time); - - }, 1, TimeUnit.MINUTES); - } - - private long getRecoveryStopTime(String nodeName) { - final RecoveryResponse recoveryResponse = client().admin().indices().prepareRecoveries(INDEX_NAME).get(); - final List recoveryStates = recoveryResponse.shardRecoveryStates().get(INDEX_NAME); - logger.info("Recovery states {}", recoveryResponse); - for (RecoveryState recoveryState : recoveryStates) { - if (recoveryState.getTargetNode().getName().equals(nodeName)) { - return recoveryState.getTimer().stopTime(); - } - } - return 0L; - } } From 44a9f180a5643e3b4dee1150c5a68d1591270ced Mon Sep 17 00:00:00 2001 From: Aman Khare <85096200+amkhar@users.noreply.github.com> Date: Thu, 26 Oct 2023 00:21:12 +0530 Subject: [PATCH 13/33] Fix flaky testClusterStateBatchedUpdates test (#10922) Signed-off-by: Aman Khare Co-authored-by: Aman Khare --- .../org/opensearch/cluster/service/MasterServiceTests.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java b/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java index 4c0ca826f5dcc..85f6c129944fa 100644 --- a/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java +++ b/server/src/test/java/org/opensearch/cluster/service/MasterServiceTests.java @@ -487,6 +487,9 @@ public void onFailure(String source, Exception e) { } }); assertBusy(mockAppender::assertAllExpectationsMatched); + // verify stats values after state is published + assertEquals(1, clusterManagerService.getClusterStateStats().getUpdateSuccess()); + assertEquals(0, clusterManagerService.getClusterStateStats().getUpdateFailed()); } } } @@ -691,9 +694,6 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS 
submittedTasksPerThread.get(entry.getKey()).get() ); } - // verify stats values after state is published - assertEquals(1, clusterManagerService.getClusterStateStats().getUpdateSuccess()); - assertEquals(0, clusterManagerService.getClusterStateStats().getUpdateFailed()); } } From fb6fe1bf4518a4a4a8507564fbd4db1971f29151 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 25 Oct 2023 14:20:48 -0700 Subject: [PATCH 14/33] Fix flaky test IndexShardTests.testLocalDirectoryContains (#10929) This test is breaking for WindowsFS only. Moving it to a separate file where it is skipped on WindowsFS. Signed-off-by: Marc Handalian --- .../index/shard/IndexShardTests.java | 52 ------------- .../RemoteIndexShardCorruptionTests.java | 75 +++++++++++++++++++ 2 files changed, 75 insertions(+), 52 deletions(-) create mode 100644 server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java diff --git a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java index f5f8cd1dcfb3f..fa3cf7676f55c 100644 --- a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java @@ -32,7 +32,6 @@ package org.opensearch.index.shard; import org.apache.logging.log4j.Logger; -import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexCommit; @@ -46,7 +45,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; import org.apache.lucene.tests.mockfile.ExtrasFS; import org.apache.lucene.tests.store.BaseDirectoryWrapper; import org.apache.lucene.util.BytesRef; @@ -93,7 +91,6 @@ import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.index.shard.ShardId; import org.opensearch.core.indices.breaker.NoneCircuitBreakerService; -import org.opensearch.core.util.FileSystemUtils; import org.opensearch.core.xcontent.MediaTypeRegistry; import org.opensearch.core.xcontent.NamedXContentRegistry; import org.opensearch.core.xcontent.XContentBuilder; @@ -166,13 +163,11 @@ import org.junit.Assert; import java.io.IOException; -import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.SimpleFileVisitor; -import java.nio.file.StandardOpenOption; import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; import java.util.Arrays; @@ -4912,53 +4907,6 @@ public void testRecordsForceMerges() throws IOException { closeShards(shard); } - public void testLocalDirectoryContains() throws IOException { - IndexShard indexShard = newStartedShard(true); - int numDocs = between(1, 10); - for (int i = 0; i < numDocs; i++) { - indexDoc(indexShard, "_doc", Integer.toString(i)); - } - flushShard(indexShard); - indexShard.store().incRef(); - Directory localDirectory = indexShard.store().directory(); - Path shardPath = indexShard.shardPath().getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME); - Path tempDir = createTempDir(); - for (String file : localDirectory.listAll()) { - if (file.equals("write.lock") || file.startsWith("extra")) { - continue; - } - boolean corrupted = randomBoolean(); - long checksum = 0; - try (IndexInput indexInput = 
localDirectory.openInput(file, IOContext.DEFAULT)) { - checksum = CodecUtil.retrieveChecksum(indexInput); - } - if (corrupted) { - Files.copy(shardPath.resolve(file), tempDir.resolve(file)); - try (FileChannel raf = FileChannel.open(shardPath.resolve(file), StandardOpenOption.READ, StandardOpenOption.WRITE)) { - CorruptionUtils.corruptAt(shardPath.resolve(file), raf, (int) (raf.size() - 8)); - } - } - if (corrupted == false) { - assertTrue(indexShard.localDirectoryContains(localDirectory, file, checksum)); - } else { - assertFalse(indexShard.localDirectoryContains(localDirectory, file, checksum)); - assertFalse(Files.exists(shardPath.resolve(file))); - } - } - try (Stream files = Files.list(tempDir)) { - files.forEach(p -> { - try { - Files.copy(p, shardPath.resolve(p.getFileName())); - } catch (IOException e) { - // Ignore - } - }); - } - FileSystemUtils.deleteSubDirectories(tempDir); - indexShard.store().decRef(); - closeShards(indexShard); - } - private void populateSampleRemoteSegmentStats(RemoteSegmentTransferTracker tracker) { tracker.addUploadBytesStarted(30L); tracker.addUploadBytesSucceeded(10L); diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java new file mode 100644 index 0000000000000..21bf580712761 --- /dev/null +++ b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java @@ -0,0 +1,75 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.shard; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.opensearch.core.util.FileSystemUtils; +import org.opensearch.test.CorruptionUtils; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.stream.Stream; + +@LuceneTestCase.SuppressFileSystems("WindowsFS") +public class RemoteIndexShardCorruptionTests extends IndexShardTestCase { + + public void testLocalDirectoryContains() throws IOException { + IndexShard indexShard = newStartedShard(true); + int numDocs = between(1, 10); + for (int i = 0; i < numDocs; i++) { + indexDoc(indexShard, "_doc", Integer.toString(i)); + } + flushShard(indexShard); + indexShard.store().incRef(); + Directory localDirectory = indexShard.store().directory(); + Path shardPath = indexShard.shardPath().getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME); + Path tempDir = createTempDir(); + for (String file : localDirectory.listAll()) { + if (file.equals("write.lock") || file.startsWith("extra")) { + continue; + } + boolean corrupted = randomBoolean(); + long checksum = 0; + try (IndexInput indexInput = localDirectory.openInput(file, IOContext.DEFAULT)) { + checksum = CodecUtil.retrieveChecksum(indexInput); + } + if (corrupted) { + Files.copy(shardPath.resolve(file), tempDir.resolve(file)); + try (FileChannel raf = FileChannel.open(shardPath.resolve(file), StandardOpenOption.READ, StandardOpenOption.WRITE)) { + CorruptionUtils.corruptAt(shardPath.resolve(file), raf, (int) (raf.size() - 8)); + } + } + if (corrupted == false) { + 
assertTrue(indexShard.localDirectoryContains(localDirectory, file, checksum)); + } else { + assertFalse(indexShard.localDirectoryContains(localDirectory, file, checksum)); + assertFalse(Files.exists(shardPath.resolve(file))); + } + } + try (Stream files = Files.list(tempDir)) { + files.forEach(p -> { + try { + Files.copy(p, shardPath.resolve(p.getFileName())); + } catch (IOException e) { + // Ignore + } + }); + } + FileSystemUtils.deleteSubDirectories(tempDir); + indexShard.store().decRef(); + closeShards(indexShard); + } +} From b17d4a8d6fa2463adbf5fc7fbaa1b1bcc4424121 Mon Sep 17 00:00:00 2001 From: Ashish Date: Thu, 26 Oct 2023 06:20:41 +0530 Subject: [PATCH 15/33] [Remote Store] Fix refresh lag bug on primary term change (#10918) * [Remote Store] Fix refresh lag bug on primary term change Signed-off-by: Ashish Singh * Add Integ Tests Signed-off-by: Ashish Singh * Incorporate PR review feedback Signed-off-by: Ashish Singh * Empty-Commit Signed-off-by: Ashish Singh --------- Signed-off-by: Ashish Singh --- .../remotestore/RemoteStoreStatsIT.java | 23 ++++++++++++++++ .../shard/RemoteStoreRefreshListener.java | 27 ++++++++++++------- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java index b1dbb0a900bc7..2d3ab135d0377 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreStatsIT.java @@ -655,6 +655,29 @@ public void testStatsCorrectnessOnFailover() { logger.info("Test completed"); } + public void testZeroLagOnCreateIndex() throws InterruptedException { + setup(); + String clusterManagerNode = internalCluster().getClusterManagerName(); + + int numOfShards = randomIntBetween(1, 3); + createIndex(INDEX_NAME, remoteStoreIndexSettings(1, numOfShards)); + ensureGreen(INDEX_NAME); + long currentTimeNs = System.nanoTime(); + while (currentTimeNs == System.nanoTime()) { + Thread.sleep(10); + } + + for (int i = 0; i < numOfShards; i++) { + RemoteStoreStatsResponse response = client(clusterManagerNode).admin() + .cluster() + .prepareRemoteStoreStats(INDEX_NAME, String.valueOf(i)) + .get(); + for (RemoteStoreStats remoteStoreStats : response.getRemoteStoreStats()) { + assertEquals(0, remoteStoreStats.getSegmentStats().refreshTimeLagMs); + } + } + } + private void indexDocs() { for (int i = 0; i < randomIntBetween(5, 10); i++) { if (randomBoolean()) { diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index 3e97b07abfb5d..464adc88ae16f 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -86,7 +86,7 @@ public final class RemoteStoreRefreshListener extends CloseableRetryableRefreshL private final RemoteSegmentStoreDirectory remoteDirectory; private final RemoteSegmentTransferTracker segmentTracker; private final Map localSegmentChecksumMap; - private long primaryTerm; + private volatile long primaryTerm; private volatile Iterator backoffDelayIterator; private final SegmentReplicationCheckpointPublisher checkpointPublisher; @@ -126,10 +126,9 @@ protected void runAfterRefreshExactlyOnce(boolean didRefresh) { // We have 2 separate methods to check if 
sync needs to be done or not. This is required since we use the return boolean // from isReadyForUpload to schedule refresh retries as the index shard or the primary mode are not in complete // ready state. - if (shouldSync(didRefresh) && isReadyForUpload()) { - segmentTracker.updateLocalRefreshTimeAndSeqNo(); + if (shouldSync(didRefresh, true) && isReadyForUpload()) { try { - initializeRemoteDirectoryOnTermUpdate(); + segmentTracker.updateLocalRefreshTimeAndSeqNo(); try (GatedCloseable segmentInfosGatedCloseable = indexShard.getSegmentInfosSnapshot()) { Collection localSegmentsPostRefresh = segmentInfosGatedCloseable.get().files(true); updateLocalSizeMapAndTracker(localSegmentsPostRefresh); @@ -150,7 +149,7 @@ protected void runAfterRefreshExactlyOnce(boolean didRefresh) { @Override protected boolean performAfterRefreshWithPermit(boolean didRefresh) { boolean successful; - if (shouldSync(didRefresh)) { + if (shouldSync(didRefresh, false)) { successful = syncSegments(); } else { successful = true; @@ -158,10 +157,15 @@ protected boolean performAfterRefreshWithPermit(boolean didRefresh) { return successful; } - private boolean shouldSync(boolean didRefresh) { - return this.primaryTerm != indexShard.getOperationPrimaryTerm() - // If the readers change, didRefresh is always true. - || didRefresh + /** + * This checks if there is a sync required to remote. + * + * @param didRefresh if the readers changed. + * @param skipPrimaryTermCheck consider change in primary term or not for should sync + * @return true if sync is needed + */ + private boolean shouldSync(boolean didRefresh, boolean skipPrimaryTermCheck) { + boolean shouldSync = didRefresh // If the readers change, didRefresh is always true. // The third condition exists for uploading the zero state segments where the refresh has not changed the reader // reference, but it is important to upload the zero state segments so that the restore does not break. || remoteDirectory.getSegmentsUploadedToRemoteStore().isEmpty() @@ -169,6 +173,10 @@ private boolean shouldSync(boolean didRefresh) { // we update the primary term and the same condition would not evaluate to true again in syncSegments. // Below check ensures that if there is commit, then that gets picked up by both 1st and 2nd shouldSync call. || isRefreshAfterCommitSafe(); + if (shouldSync || skipPrimaryTermCheck) { + return shouldSync; + } + return this.primaryTerm != indexShard.getOperationPrimaryTerm(); } private boolean syncSegments() { @@ -188,6 +196,7 @@ private boolean syncSegments() { try { try { + initializeRemoteDirectoryOnTermUpdate(); // if a new segments_N file is present in local that is not uploaded to remote store yet, it // is considered as a first refresh post commit. A cleanup of stale commit files is triggered. // This is done to avoid delete post each refresh. 
From 003b2cf30103282b031f4462ed53488edb60b9a4 Mon Sep 17 00:00:00 2001 From: Suraj Singh Date: Wed, 25 Oct 2023 18:22:19 -0700 Subject: [PATCH 16/33] [Segment Replication] Handle exceptions on local file read during replication (#10933) * Handle exceptions on file read Signed-off-by: Suraj Singh * Address review comments Signed-off-by: Suraj Singh --------- Signed-off-by: Suraj Singh --- .../replication/SegmentReplicationTarget.java | 11 +- .../index/shard/RemoteIndexShardTests.java | 159 ++++++++++++++---- 2 files changed, 134 insertions(+), 36 deletions(-) diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java index cd6dbe8af90d9..cc71ef816e525 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java @@ -232,6 +232,7 @@ private List getFiles(CheckpointInfoResponse checkpointInfo) return missingFiles; } + // pkg private for tests private boolean validateLocalChecksum(StoreFileMetadata file) { try (IndexInput indexInput = indexShard.store().directory().openInput(file.name(), IOContext.DEFAULT)) { String checksum = Store.digestToString(CodecUtil.retrieveChecksum(indexInput)); @@ -243,7 +244,15 @@ private boolean validateLocalChecksum(StoreFileMetadata file) { return false; } } catch (IOException e) { - throw new UncheckedIOException("Error reading " + file, e); + logger.warn("Error reading " + file, e); + // Delete file on exceptions so that it can be re-downloaded. This is safe to do as this file is local only + // and not referenced by reader. + try { + indexShard.store().directory().deleteFile(file.name()); + } catch (IOException ex) { + throw new UncheckedIOException("Error reading " + file, e); + } + return false; } } diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java index 703a7d457d5b6..2ce0bdc607189 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java @@ -31,18 +31,20 @@ import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import org.opensearch.indices.replication.common.ReplicationFailedException; import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.test.CorruptionUtils; import org.hamcrest.MatcherAssert; import org.junit.Assert; import java.io.IOException; +import java.nio.channels.FileChannel; import java.nio.file.Path; +import java.nio.file.StandardOpenOption; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.CountDownLatch; -import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BiConsumer; import java.util.stream.Collectors; @@ -371,37 +373,9 @@ public void testSegRepSucceedsOnPreviousCopiedFiles() throws Exception { final SegmentReplicationSourceFactory sourceFactory = mock(SegmentReplicationSourceFactory.class); final SegmentReplicationTargetService targetService = newTargetService(sourceFactory); - Runnable[] runAfterGetFiles = { () -> { throw new RuntimeException("Simulated"); }, () -> {} }; - AtomicInteger index = new AtomicInteger(0); - RemoteStoreReplicationSource testRSReplicationSource = new 
RemoteStoreReplicationSource(replica) { - @Override - public void getCheckpointMetadata( - long replicationId, - ReplicationCheckpoint checkpoint, - ActionListener listener - ) { - super.getCheckpointMetadata(replicationId, checkpoint, listener); - } - - @Override - public void getSegmentFiles( - long replicationId, - ReplicationCheckpoint checkpoint, - List filesToFetch, - IndexShard indexShard, - BiConsumer fileProgressTracker, - ActionListener listener - ) { - super.getSegmentFiles(replicationId, checkpoint, filesToFetch, indexShard, (fileName, bytesRecovered) -> {}, listener); - runAfterGetFiles[index.getAndIncrement()].run(); - } - - @Override - public String getDescription() { - return "TestRemoteStoreReplicationSource"; - } - }; - when(sourceFactory.get(any())).thenReturn(testRSReplicationSource); + when(sourceFactory.get(any())).thenReturn( + getRemoteStoreReplicationSource(replica, () -> { throw new RuntimeException("Simulated"); }) + ); CountDownLatch latch = new CountDownLatch(1); // Start first round of segment replication. This should fail with simulated error but with replica having @@ -412,6 +386,7 @@ public String getDescription() { new SegmentReplicationTargetService.SegmentReplicationListener() { @Override public void onReplicationDone(SegmentReplicationState state) { + latch.countDown(); Assert.fail("Replication should fail with simulated error"); } @@ -421,9 +396,9 @@ public void onReplicationFailure( ReplicationFailedException e, boolean sendShardFailure ) { + latch.countDown(); assertFalse(sendShardFailure); logger.error("Replication error", e); - latch.countDown(); } } ); @@ -439,7 +414,8 @@ public void onReplicationFailure( assertEquals("Files should be copied to disk", false, onDiskFiles.isEmpty()); assertEquals(target.state().getStage(), SegmentReplicationState.Stage.GET_FILES); - // Start next round of segment replication + // Start next round of segment replication and not throwing exception resulting in commit on replica + when(sourceFactory.get(any())).thenReturn(getRemoteStoreReplicationSource(replica, () -> {})); CountDownLatch waitForSecondRound = new CountDownLatch(1); final SegmentReplicationTarget newTarget = targetService.startReplication( replica, @@ -456,9 +432,9 @@ public void onReplicationFailure( ReplicationFailedException e, boolean sendShardFailure ) { + waitForSecondRound.countDown(); logger.error("Replication error", e); Assert.fail("Replication should not fail"); - waitForSecondRound.countDown(); } } ); @@ -471,6 +447,119 @@ public void onReplicationFailure( } } + /** + * This test validates that local non-readable (corrupt, partially) on disk are deleted vs failing the + * replication event. This test mimics local files (not referenced by reader) by throwing exception post file copy and + * blocking update of reader. Once this is done, it corrupts one segment file and ensure that file is deleted in next + * round of segment replication by ensuring doc count. 
+ */ + public void testNoFailuresOnFileReads() throws Exception { + try (ReplicationGroup shards = createGroup(1, getIndexSettings(), new NRTReplicationEngineFactory())) { + shards.startAll(); + IndexShard primary = shards.getPrimary(); + final IndexShard replica = shards.getReplicas().get(0); + + final int docCount = 10; + shards.indexDocs(docCount); + primary.refresh("Test"); + + final SegmentReplicationSourceFactory sourceFactory = mock(SegmentReplicationSourceFactory.class); + final SegmentReplicationTargetService targetService = newTargetService(sourceFactory); + when(sourceFactory.get(any())).thenReturn( + getRemoteStoreReplicationSource(replica, () -> { throw new RuntimeException("Simulated"); }) + ); + CountDownLatch waitOnReplicationCompletion = new CountDownLatch(1); + + // Start first round of segment replication. This should fail with simulated error but with replica having + // files in its local store but not in active reader. + SegmentReplicationTarget segmentReplicationTarget = targetService.startReplication( + replica, + primary.getLatestReplicationCheckpoint(), + new SegmentReplicationTargetService.SegmentReplicationListener() { + @Override + public void onReplicationDone(SegmentReplicationState state) { + waitOnReplicationCompletion.countDown(); + Assert.fail("Replication should fail with simulated error"); + } + + @Override + public void onReplicationFailure( + SegmentReplicationState state, + ReplicationFailedException e, + boolean sendShardFailure + ) { + waitOnReplicationCompletion.countDown(); + assertFalse(sendShardFailure); + } + } + ); + waitOnReplicationCompletion.await(); + assertBusy(() -> { assertEquals("Target should be closed", 0, segmentReplicationTarget.refCount()); }); + String fileToCorrupt = null; + // Corrupt one data file + Path shardPath = replica.shardPath().getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME); + for (String file : replica.store().directory().listAll()) { + if (file.equals("write.lock") || file.startsWith("extra") || file.startsWith("segment")) { + continue; + } + fileToCorrupt = file; + logger.info("--> Corrupting file {}", fileToCorrupt); + try (FileChannel raf = FileChannel.open(shardPath.resolve(file), StandardOpenOption.READ, StandardOpenOption.WRITE)) { + CorruptionUtils.corruptAt(shardPath.resolve(file), raf, (int) (raf.size() - 8)); + } + break; + } + Assert.assertNotNull(fileToCorrupt); + + // Ingest more data and start next round of segment replication + shards.indexDocs(docCount); + primary.refresh("Post corruption"); + replicateSegments(primary, List.of(replica)); + + assertDocCount(primary, 2 * docCount); + assertDocCount(replica, 2 * docCount); + + final Store.RecoveryDiff diff = Store.segmentReplicationDiff(primary.getSegmentMetadataMap(), replica.getSegmentMetadataMap()); + assertTrue(diff.missing.isEmpty()); + assertTrue(diff.different.isEmpty()); + + // clean up + shards.removeReplica(replica); + closeShards(replica); + } + } + + private RemoteStoreReplicationSource getRemoteStoreReplicationSource(IndexShard shard, Runnable postGetFilesRunnable) { + return new RemoteStoreReplicationSource(shard) { + @Override + public void getCheckpointMetadata( + long replicationId, + ReplicationCheckpoint checkpoint, + ActionListener listener + ) { + super.getCheckpointMetadata(replicationId, checkpoint, listener); + } + + @Override + public void getSegmentFiles( + long replicationId, + ReplicationCheckpoint checkpoint, + List filesToFetch, + IndexShard indexShard, + BiConsumer fileProgressTracker, + ActionListener listener + ) { 
+ super.getSegmentFiles(replicationId, checkpoint, filesToFetch, indexShard, (fileName, bytesRecovered) -> {}, listener); + postGetFilesRunnable.run(); + } + + @Override + public String getDescription() { + return "TestRemoteStoreReplicationSource"; + } + }; + } + @Override protected void validateShardIdleWithNoReplicas(IndexShard primary) { // ensure search idle conditions are met. From fe8b2d545da09e3d80f74cd57a7ae991e1005abe Mon Sep 17 00:00:00 2001 From: Ashish Date: Thu, 26 Oct 2023 07:23:18 +0530 Subject: [PATCH 17/33] [Remote Store] Fix shard failure on flush due to upload timeout (#10926) --- .../transfer/TranslogTransferManager.java | 10 +- .../TranslogTransferManagerTests.java | 94 +++++++++++++++++++ 2 files changed, 100 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java index ece6f6d5a534f..2f6055df87804 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java @@ -42,7 +42,6 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; import static org.opensearch.index.translog.transfer.FileSnapshot.TransferFileSnapshot; @@ -156,14 +155,17 @@ public boolean transferSnapshot(TransferSnapshot transferSnapshot, TranslogTrans try { if (latch.await(TRANSFER_TIMEOUT_IN_MILLIS, TimeUnit.MILLISECONDS) == false) { - Exception ex = new TimeoutException("Timed out waiting for transfer of snapshot " + transferSnapshot + " to complete"); + Exception ex = new TranslogUploadFailedException( + "Timed out waiting for transfer of snapshot " + transferSnapshot + " to complete" + ); exceptionList.forEach(ex::addSuppressed); throw ex; } } catch (InterruptedException ex) { - exceptionList.forEach(ex::addSuppressed); + Exception exception = new TranslogUploadFailedException("Failed to upload " + transferSnapshot, ex); + exceptionList.forEach(exception::addSuppressed); Thread.currentThread().interrupt(); - throw ex; + throw exception; } if (exceptionList.isEmpty()) { TransferFileSnapshot tlogMetadata = prepareMetadata(transferSnapshot); diff --git a/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java b/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java index af596e7df02c2..e34bc078896f9 100644 --- a/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java +++ b/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java @@ -10,6 +10,7 @@ import org.apache.lucene.tests.util.LuceneTestCase; import org.opensearch.action.LatchedActionListener; +import org.opensearch.common.SetOnce; import org.opensearch.common.blobstore.BlobContainer; import org.opensearch.common.blobstore.BlobMetadata; import org.opensearch.common.blobstore.BlobPath; @@ -35,6 +36,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.LinkedList; @@ -180,6 +182,93 @@ public void onUploadFailed(TransferSnapshot transferSnapshot, Exception ex) { assertEquals(4, fileTransferTracker.allUploaded().size()); } + 
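+    // The two tests below stub TransferService#uploadBlobs with an upload that never completes in time:
+    // the first lets the latch wait on TRANSFER_TIMEOUT_IN_MILLIS expire (the 31s sleep is assumed to
+    // exceed that timeout), the second interrupts the thread blocked in transferSnapshot. In both cases
+    // the listener should now receive a TranslogUploadFailedException rather than a raw
+    // TimeoutException or InterruptedException.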
public void testTransferSnapshotOnUploadTimeout() throws Exception { + doAnswer(invocationOnMock -> { + Thread.sleep(31 * 1000); + return null; + }).when(transferService).uploadBlobs(anySet(), anyMap(), any(ActionListener.class), any(WritePriority.class)); + FileTransferTracker fileTransferTracker = new FileTransferTracker( + new ShardId("index", "indexUUid", 0), + remoteTranslogTransferTracker + ); + TranslogTransferManager translogTransferManager = new TranslogTransferManager( + shardId, + transferService, + remoteBaseTransferPath, + fileTransferTracker, + remoteTranslogTransferTracker + ); + SetOnce exception = new SetOnce<>(); + translogTransferManager.transferSnapshot(createTransferSnapshot(), new TranslogTransferListener() { + @Override + public void onUploadComplete(TransferSnapshot transferSnapshot) {} + + @Override + public void onUploadFailed(TransferSnapshot transferSnapshot, Exception ex) { + exception.set(ex); + } + }); + assertNotNull(exception.get()); + assertTrue(exception.get() instanceof TranslogUploadFailedException); + assertEquals("Timed out waiting for transfer of snapshot test-to-string to complete", exception.get().getMessage()); + } + + public void testTransferSnapshotOnThreadInterrupt() throws Exception { + SetOnce uploadThread = new SetOnce<>(); + doAnswer(invocationOnMock -> { + uploadThread.set(new Thread(() -> { + ActionListener listener = invocationOnMock.getArgument(2); + try { + Thread.sleep(31 * 1000); + } catch (InterruptedException ignore) { + List list = new ArrayList<>(invocationOnMock.getArgument(0)); + listener.onFailure(new FileTransferException(list.get(0), ignore)); + } + })); + uploadThread.get().start(); + return null; + }).when(transferService).uploadBlobs(anySet(), anyMap(), any(ActionListener.class), any(WritePriority.class)); + FileTransferTracker fileTransferTracker = new FileTransferTracker( + new ShardId("index", "indexUUid", 0), + remoteTranslogTransferTracker + ); + TranslogTransferManager translogTransferManager = new TranslogTransferManager( + shardId, + transferService, + remoteBaseTransferPath, + fileTransferTracker, + remoteTranslogTransferTracker + ); + SetOnce exception = new SetOnce<>(); + + Thread thread = new Thread(() -> { + try { + translogTransferManager.transferSnapshot(createTransferSnapshot(), new TranslogTransferListener() { + @Override + public void onUploadComplete(TransferSnapshot transferSnapshot) {} + + @Override + public void onUploadFailed(TransferSnapshot transferSnapshot, Exception ex) { + exception.set(ex); + } + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + thread.start(); + + Thread.sleep(1000); + // Interrupt the thread + thread.interrupt(); + assertBusy(() -> { + assertNotNull(exception.get()); + assertTrue(exception.get() instanceof TranslogUploadFailedException); + assertEquals("Failed to upload test-to-string", exception.get().getMessage()); + }); + uploadThread.get().interrupt(); + } + private TransferSnapshot createTransferSnapshot() { return new TransferSnapshot() { @Override @@ -232,6 +321,11 @@ public Set getTranslogFileSnapshots() { public TranslogTransferMetadata getTranslogTransferMetadata() { return new TranslogTransferMetadata(primaryTerm, generation, minTranslogGeneration, randomInt(5)); } + + @Override + public String toString() { + return "test-to-string"; + } }; } From d1c94b524ea54f3fce3e40e211bb7108800c07de Mon Sep 17 00:00:00 2001 From: Dhwanil Patel Date: Thu, 26 Oct 2023 11:35:37 +0530 Subject: [PATCH 18/33] Improved logging around remote cluster state 
(#10892) * Improved logging around remote cluster state Signed-off-by: Dhwanil Patel --- .../remote/RemoteClusterStateService.java | 28 +++++++++++++------ .../recovery/RemoteStoreRestoreService.java | 1 + 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index 57b1b972e08c0..fa4eef79c46dc 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -244,9 +244,8 @@ public ClusterMetadataManifest writeFullMetadata(ClusterState clusterState, Stri allUploadedIndexMetadata.size() ); } else { - // todo change to debug logger.info( - "writing cluster state took [{}ms]; " + "wrote full state with [{}] indices", + "writing cluster state took [{}ms]; " + "wrote full state with [{}] indices and global metadata", durationMillis, allUploadedIndexMetadata.size() ); @@ -285,6 +284,7 @@ public ClusterMetadataManifest writeIncrementalMetadata( if (updateGlobalMetadata || previousManifest.getGlobalMetadataFileName() == null) { globalMetadataFile = writeGlobalMetadata(clusterState); } else { + logger.debug("Global metadata has not updated in cluster state, skipping upload of it"); globalMetadataFile = previousManifest.getGlobalMetadataFileName(); } @@ -305,7 +305,7 @@ public ClusterMetadataManifest writeIncrementalMetadata( for (final IndexMetadata indexMetadata : clusterState.metadata().indices().values()) { final Long previousVersion = previousStateIndexMetadataVersionByName.get(indexMetadata.getIndex().getName()); if (previousVersion == null || indexMetadata.getVersion() != previousVersion) { - logger.trace( + logger.debug( "updating metadata for [{}], changing version from [{}] to [{}]", indexMetadata.getIndex(), previousVersion, @@ -342,18 +342,22 @@ public ClusterMetadataManifest writeIncrementalMetadata( if (durationMillis >= slowWriteLoggingThreshold.getMillis()) { logger.warn( "writing cluster state took [{}ms] which is above the warn threshold of [{}]; " - + "wrote metadata for [{}] indices and skipped [{}] unchanged indices", + + "wrote metadata for [{}] indices and skipped [{}] unchanged indices, global metadata updated : [{}]", durationMillis, slowWriteLoggingThreshold, numIndicesUpdated, - numIndicesUnchanged + numIndicesUnchanged, + updateGlobalMetadata ); } else { - logger.trace( - "writing cluster state took [{}ms]; " + "wrote metadata for [{}] indices and skipped [{}] unchanged indices", + logger.info( + "writing cluster state for version [{}] took [{}ms]; " + + "wrote metadata for [{}] indices and skipped [{}] unchanged indices, global metadata updated : [{}]", + manifest.getStateVersion(), durationMillis, numIndicesUpdated, - numIndicesUnchanged + numIndicesUnchanged, + updateGlobalMetadata ); } return manifest; @@ -605,6 +609,11 @@ private void writeMetadataManifest(String clusterName, String clusterUUID, Clust blobStoreRepository.getCompressor(), FORMAT_PARAMS ); + logger.debug( + "Metadata manifest file [{}] written during [{}] phase. ", + fileName, + uploadManifest.isCommitted() ? 
"commit" : "publish" + ); } private String fetchPreviousClusterUUID(String clusterName, String clusterUUID) { @@ -912,6 +921,7 @@ private List createClusterChain(final Map { indexMetadataMap.put(indexMetadata.getIndex().getName(), new Tuple<>(true, indexMetadata)); From 746ca09ac3ce33b27e94c871a94f0ded076e19e3 Mon Sep 17 00:00:00 2001 From: Sooraj Sinha <81695996+soosinha@users.noreply.github.com> Date: Thu, 26 Oct 2023 11:55:13 +0530 Subject: [PATCH 19/33] Fix valid cluster UUID logic for uncommitted cluster UUIDs (#10916) Signed-off-by: Sooraj Sinha --- .../remote/RemoteClusterStateService.java | 36 +++++++------- .../RemoteClusterStateServiceTests.java | 49 +++++++++++++++---- 2 files changed, 59 insertions(+), 26 deletions(-) diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index fa4eef79c46dc..b3309b1fd8a63 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -881,25 +881,31 @@ private Map getLatestManifestForAllClusterUUIDs * @return List of cluster UUIDs. The first element is the most recent cluster UUID in the chain */ private List createClusterChain(final Map manifestsByClusterUUID, final String clusterName) { - final Map clusterUUIDGraph = manifestsByClusterUUID.values() + final List validClusterManifests = manifestsByClusterUUID.values() .stream() + .filter(this::isValidClusterUUID) + .collect(Collectors.toList()); + final Map clusterUUIDGraph = validClusterManifests.stream() .collect(Collectors.toMap(ClusterMetadataManifest::getClusterUUID, ClusterMetadataManifest::getPreviousClusterUUID)); - final List validClusterUUIDs = manifestsByClusterUUID.values() - .stream() - .filter(m -> !isInvalidClusterUUID(m) && !clusterUUIDGraph.containsValue(m.getClusterUUID())) + final List topLevelClusterUUIDs = validClusterManifests.stream() .map(ClusterMetadataManifest::getClusterUUID) + .filter(clusterUUID -> !clusterUUIDGraph.containsValue(clusterUUID)) .collect(Collectors.toList()); - if (validClusterUUIDs.isEmpty()) { - logger.info("There is no valid previous cluster UUID"); + + if (topLevelClusterUUIDs.isEmpty()) { + // This can occur only when there are no valid cluster UUIDs + assert validClusterManifests.isEmpty() : "There are no top level cluster UUIDs even when there are valid cluster UUIDs"; + logger.info("There is no valid previous cluster UUID. All cluster UUIDs evaluated are: {}", manifestsByClusterUUID.keySet()); return Collections.emptyList(); } - if (validClusterUUIDs.size() > 1) { + if (topLevelClusterUUIDs.size() > 1) { + logger.info("Top level cluster UUIDs: {}", topLevelClusterUUIDs); // If the valid cluster UUIDs are more that 1, it means there was some race condition where // more then 2 cluster manager nodes tried to become active cluster manager and published // 2 cluster UUIDs which followed the same previous UUID. 
final Map manifestsByClusterUUIDTrimmed = trimClusterUUIDs( manifestsByClusterUUID, - validClusterUUIDs, + topLevelClusterUUIDs, clusterName ); if (manifestsByClusterUUID.size() == manifestsByClusterUUIDTrimmed.size()) { @@ -908,14 +914,14 @@ private List createClusterChain(final Map validChain = new ArrayList<>(); - String currentUUID = validClusterUUIDs.get(0); + String currentUUID = topLevelClusterUUIDs.get(0); while (currentUUID != null && !ClusterState.UNKNOWN_UUID.equals(currentUUID)) { validChain.add(currentUUID); // Getting the previous cluster UUID of a cluster UUID from the clusterUUID Graph @@ -942,11 +948,7 @@ private Map trimClusterUUIDs( // Here we compare the manifest of current UUID to that of previous UUID // In case currentUUID's latest manifest is same as previous UUIDs latest manifest, // that means it was restored from previousUUID and no IndexMetadata update was performed on it. - if (ClusterState.UNKNOWN_UUID.equals(currentManifest.getPreviousClusterUUID())) { - if (currentManifest.getIndices().isEmpty()) { - trimmedUUIDs.remove(clusterUUID); - } - } else { + if (!ClusterState.UNKNOWN_UUID.equals(currentManifest.getPreviousClusterUUID())) { ClusterMetadataManifest previousManifest = trimmedUUIDs.get(currentManifest.getPreviousClusterUUID()); if (isMetadataEqual(currentManifest, previousManifest, clusterName) && isGlobalMetadataEqual(currentManifest, previousManifest, clusterName)) { @@ -985,8 +987,8 @@ private boolean isGlobalMetadataEqual(ClusterMetadataManifest first, ClusterMeta return Metadata.isGlobalResourcesMetadataEquals(firstGlobalMetadata, secondGlobalMetadata); } - private boolean isInvalidClusterUUID(ClusterMetadataManifest manifest) { - return !manifest.isClusterUUIDCommitted(); + private boolean isValidClusterUUID(ClusterMetadataManifest manifest) { + return manifest.isClusterUUIDCommitted(); } /** diff --git a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java index ca88653f529f6..586618bd1ecff 100644 --- a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java +++ b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java @@ -911,7 +911,7 @@ public void testGetValidPreviousClusterUUIDWithMultipleChains() throws IOExcepti "cluster-uuid3", "cluster-uuid1" ); - mockObjectsForGettingPreviousClusterUUID(clusterUUIDsPointers, randomBoolean()); + mockObjectsForGettingPreviousClusterUUID(clusterUUIDsPointers, randomBoolean(), Collections.emptyMap()); remoteClusterStateService.start(); String previousClusterUUID = remoteClusterStateService.getLastKnownUUIDFromRemote("test-cluster"); @@ -933,6 +933,23 @@ public void testGetValidPreviousClusterUUIDWithInvalidMultipleChains() throws IO assertThrows(IllegalStateException.class, () -> remoteClusterStateService.getLastKnownUUIDFromRemote("test-cluster")); } + public void testGetValidPreviousClusterUUIDWhenLastUUIDUncommitted() throws IOException { + Map clusterUUIDsPointers = Map.of( + "cluster-uuid1", + ClusterState.UNKNOWN_UUID, + "cluster-uuid2", + "cluster-uuid1", + "cluster-uuid3", + "cluster-uuid2" + ); + Map clusterUUIDCommitted = Map.of("cluster-uuid1", true, "cluster-uuid2", true, "cluster-uuid3", false); + mockObjectsForGettingPreviousClusterUUID(clusterUUIDsPointers, clusterUUIDCommitted); + + remoteClusterStateService.start(); + String previousClusterUUID = 
remoteClusterStateService.getLastKnownUUIDFromRemote("test-cluster"); + assertThat(previousClusterUUID, equalTo("cluster-uuid2")); + } + public void testDeleteStaleClusterUUIDs() throws IOException { final ClusterState clusterState = generateClusterStateWithOneIndex().nodes(nodesWithLocalNodeClusterManager()).build(); ClusterMetadataManifest clusterMetadataManifest = ClusterMetadataManifest.builder() @@ -1128,11 +1145,21 @@ public void testGlobalMetadataUploadWaitTimeSetting() { } private void mockObjectsForGettingPreviousClusterUUID(Map clusterUUIDsPointers) throws IOException { - mockObjectsForGettingPreviousClusterUUID(clusterUUIDsPointers, false); + mockObjectsForGettingPreviousClusterUUID(clusterUUIDsPointers, false, Collections.emptyMap()); } - private void mockObjectsForGettingPreviousClusterUUID(Map clusterUUIDsPointers, boolean differGlobalMetadata) - throws IOException { + private void mockObjectsForGettingPreviousClusterUUID( + Map clusterUUIDsPointers, + Map clusterUUIDCommitted + ) throws IOException { + mockObjectsForGettingPreviousClusterUUID(clusterUUIDsPointers, false, clusterUUIDCommitted); + } + + private void mockObjectsForGettingPreviousClusterUUID( + Map clusterUUIDsPointers, + boolean differGlobalMetadata, + Map clusterUUIDCommitted + ) throws IOException { final BlobPath blobPath = mock(BlobPath.class); when((blobStoreRepository.basePath())).thenReturn(blobPath); when(blobPath.add(anyString())).thenReturn(blobPath); @@ -1155,7 +1182,8 @@ private void mockObjectsForGettingPreviousClusterUUID(Map cluste clusterUUIDsPointers.get("cluster-uuid1"), randomAlphaOfLength(10), uploadedIndexMetadataList1, - "test-metadata1" + "test-metadata1", + clusterUUIDCommitted.getOrDefault("cluster-uuid1", true) ); Settings indexSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexMetadata indexMetadata1 = IndexMetadata.builder("index1") @@ -1184,7 +1212,8 @@ private void mockObjectsForGettingPreviousClusterUUID(Map cluste clusterUUIDsPointers.get("cluster-uuid2"), randomAlphaOfLength(10), uploadedIndexMetadataList2, - "test-metadata2" + "test-metadata2", + clusterUUIDCommitted.getOrDefault("cluster-uuid2", true) ); IndexMetadata indexMetadata3 = IndexMetadata.builder("index1") .settings(indexSettings) @@ -1229,7 +1258,8 @@ private void mockObjectsForGettingPreviousClusterUUID(Map cluste clusterUUIDsPointers.get("cluster-uuid3"), randomAlphaOfLength(10), uploadedIndexMetadataList3, - "test-metadata3" + "test-metadata3", + clusterUUIDCommitted.getOrDefault("cluster-uuid3", true) ); mockBlobContainerForGlobalMetadata(blobContainer3, clusterManifest3, metadata3); mockBlobContainer(blobContainer3, clusterManifest3, indexMetadataMap3, ClusterMetadataManifest.CODEC_V1); @@ -1257,7 +1287,8 @@ private ClusterMetadataManifest generateClusterMetadataManifest( String previousClusterUUID, String stateUUID, List uploadedIndexMetadata, - String globalMetadataFileName + String globalMetadataFileName, + Boolean isUUIDCommitted ) { return ClusterMetadataManifest.builder() .indices(uploadedIndexMetadata) @@ -1269,7 +1300,7 @@ private ClusterMetadataManifest generateClusterMetadataManifest( .opensearchVersion(VersionUtils.randomOpenSearchVersion(random())) .previousClusterUUID(previousClusterUUID) .committed(true) - .clusterUUIDCommitted(true) + .clusterUUIDCommitted(isUUIDCommitted) .globalMetadataFileName(globalMetadataFileName) .codecVersion(ClusterMetadataManifest.CODEC_V1) .build(); From 5ae93338e786173e39f5906ea1b2a5129f98ab66 Mon Sep 17 00:00:00 
2001 From: Suraj Singh Date: Thu, 26 Oct 2023 07:42:23 -0700 Subject: [PATCH 20/33] Mute testSegRepSucceedsOnPreviousCopiedFiles and testNoFailuresOnFileReads unit tests (#10942) Signed-off-by: Suraj Singh --- .../java/org/opensearch/index/shard/RemoteIndexShardTests.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java index 2ce0bdc607189..20cec90d79e3e 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java @@ -362,6 +362,7 @@ public void testPrimaryRestart() throws Exception { * prevent FileAlreadyExistsException. It does so by only copying files in first round of segment replication without * committing locally so that in next round of segment replication those files are not considered for download again */ + @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/10885") public void testSegRepSucceedsOnPreviousCopiedFiles() throws Exception { try (ReplicationGroup shards = createGroup(1, getIndexSettings(), new NRTReplicationEngineFactory())) { shards.startAll(); @@ -453,6 +454,7 @@ public void onReplicationFailure( * blocking update of reader. Once this is done, it corrupts one segment file and ensure that file is deleted in next * round of segment replication by ensuring doc count. */ + @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/10885") public void testNoFailuresOnFileReads() throws Exception { try (ReplicationGroup shards = createGroup(1, getIndexSettings(), new NRTReplicationEngineFactory())) { shards.startAll(); From 0c9fc21ae78babd6820479ea940a4c986d82f10f Mon Sep 17 00:00:00 2001 From: Poojita Raj Date: Thu, 26 Oct 2023 16:11:19 -0700 Subject: [PATCH 21/33] Add log for failover time (#10952) Signed-off-by: Poojita Raj --- .../java/org/opensearch/index/shard/IndexShard.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 5b6257084e440..352d4efc95269 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -187,6 +187,7 @@ import org.opensearch.indices.recovery.RecoveryTarget; import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import org.opensearch.indices.replication.checkpoint.SegmentReplicationCheckpointPublisher; +import org.opensearch.indices.replication.common.ReplicationTimer; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; import org.opensearch.search.suggest.completion.CompletionStats; @@ -698,7 +699,16 @@ public void updateShardState( if (indexSettings.isSegRepEnabled()) { // this Shard's engine was read only, we need to update its engine before restoring local history from xlog. 
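+            // The timer below measures only the engine reset (resetEngineToGlobalCheckpoint) step of the
+            // promotion; the elapsed time is logged as the engine failover time once the reset completes.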
assert newRouting.primary() && currentRouting.primary() == false; + ReplicationTimer timer = new ReplicationTimer(); + timer.start(); + logger.debug( + "Resetting engine on promotion of shard [{}] to primary, startTime {}\n", + shardId, + timer.startTime() + ); resetEngineToGlobalCheckpoint(); + timer.stop(); + logger.info("Completed engine failover for shard [{}] in: {} ms", shardId, timer.time()); // It is possible an engine can open with a SegmentInfos on a higher gen but the reader does not refresh to // trigger our refresh listener. // Force update the checkpoint post engine reset. From e9affeab3494f4c2ed96a3efd647530b38a315fc Mon Sep 17 00:00:00 2001 From: rishavz_sagar Date: Fri, 27 Oct 2023 14:34:13 +0530 Subject: [PATCH 22/33] Fixing unreferenced file cleanup flaky tests (#10801) Signed-off-by: RS146BIJAY --- .../index/engine/InternalEngineTests.java | 44 ++----------------- 1 file changed, 4 insertions(+), 40 deletions(-) diff --git a/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java b/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java index 305c3a3acbf75..81d8bccb86c60 100644 --- a/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java +++ b/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java @@ -40,7 +40,6 @@ import org.apache.logging.log4j.core.LogEvent; import org.apache.logging.log4j.core.appender.AbstractAppender; import org.apache.logging.log4j.core.filter.RegexFilter; -import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.document.Field; import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.LongPoint; @@ -3237,22 +3236,10 @@ public void testUnreferencedFileCleanUpOnSegmentMergeFailureWithCleanUpEnabled() MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; - @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - - // Fail segment merge with diskfull during merging terms. - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + // Fail segment merge with diskfull during merging terms + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3325,7 +3312,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // IndexWriter can throw either IOException or IllegalStateException depending on whether tragedy is set or not. 
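+        // With the simplified Failure above, any merge that reaches mergeTerms now throws
+        // "No space left on device" unconditionally, so the test no longer needs to arm the failure via
+        // fail.setDoFail() before forcing the merge.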
expectThrowsAnyOf( Arrays.asList(IOException.class, IllegalStateException.class), @@ -3345,20 +3331,10 @@ public void testUnreferencedFileCleanUpOnSegmentMergeFailureWithCleanUpDisabled( MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3439,7 +3415,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // IndexWriter can throw either IOException or IllegalStateException depending on whether tragedy is set or not. expectThrowsAnyOf( Arrays.asList(IOException.class, IllegalStateException.class), @@ -3459,20 +3434,10 @@ public void testUnreferencedFileCleanUpFailsOnSegmentMergeFailureWhenDirectoryCl MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3537,7 +3502,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // Close the store so that unreferenced file cleanup will fail. 
store.close(); From be65f543d125965d699e493933851d0cfec7e530 Mon Sep 17 00:00:00 2001 From: Ketan Verma <9292653+ketanv3@users.noreply.github.com> Date: Fri, 27 Oct 2023 18:02:29 +0530 Subject: [PATCH 23/33] Remove deprecated classes for Rounding (#10956) Signed-off-by: Ketan Verma --- CHANGELOG.md | 1 + .../benchmark/time/RoundingBenchmark.java | 180 ---- .../common/rounding/DateTimeUnit.java | 99 --- .../opensearch/common/rounding/Rounding.java | 459 ---------- .../common/rounding/package-info.java | 10 - .../org/opensearch/common/RoundingTests.java | 3 +- .../common/rounding/DateTimeUnitTests.java | 75 -- .../common/rounding/RoundingDuelTests.java | 70 -- .../rounding/TimeZoneRoundingTests.java | 822 ------------------ 9 files changed, 2 insertions(+), 1717 deletions(-) delete mode 100644 benchmarks/src/main/java/org/opensearch/benchmark/time/RoundingBenchmark.java delete mode 100644 server/src/main/java/org/opensearch/common/rounding/DateTimeUnit.java delete mode 100644 server/src/main/java/org/opensearch/common/rounding/Rounding.java delete mode 100644 server/src/main/java/org/opensearch/common/rounding/package-info.java delete mode 100644 server/src/test/java/org/opensearch/common/rounding/DateTimeUnitTests.java delete mode 100644 server/src/test/java/org/opensearch/common/rounding/RoundingDuelTests.java delete mode 100644 server/src/test/java/org/opensearch/common/rounding/TimeZoneRoundingTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index b40878066960a..234b08398f9ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -124,6 +124,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Deprecated ### Removed +- Remove deprecated classes for Rounding ([#10956](https://github.com/opensearch-project/OpenSearch/issues/10956)) ### Fixed - Fix failure in dissect ingest processor parsing empty brackets ([#9225](https://github.com/opensearch-project/OpenSearch/pull/9255)) diff --git a/benchmarks/src/main/java/org/opensearch/benchmark/time/RoundingBenchmark.java b/benchmarks/src/main/java/org/opensearch/benchmark/time/RoundingBenchmark.java deleted file mode 100644 index cdbcbfc163191..0000000000000 --- a/benchmarks/src/main/java/org/opensearch/benchmark/time/RoundingBenchmark.java +++ /dev/null @@ -1,180 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. 
- */ - -package org.opensearch.benchmark.time; - -import org.opensearch.common.Rounding; -import org.opensearch.common.rounding.DateTimeUnit; -import org.opensearch.common.time.DateUtils; -import org.opensearch.common.unit.TimeValue; -import org.joda.time.DateTimeZone; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.OutputTimeUnit; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.Warmup; - -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.util.concurrent.TimeUnit; - -import static org.opensearch.common.Rounding.DateTimeUnit.DAY_OF_MONTH; -import static org.opensearch.common.Rounding.DateTimeUnit.MONTH_OF_YEAR; -import static org.opensearch.common.Rounding.DateTimeUnit.QUARTER_OF_YEAR; -import static org.opensearch.common.Rounding.DateTimeUnit.YEAR_OF_CENTURY; - -@Fork(3) -@Warmup(iterations = 10) -@Measurement(iterations = 10) -@BenchmarkMode(Mode.AverageTime) -@OutputTimeUnit(TimeUnit.NANOSECONDS) -@State(Scope.Benchmark) -@SuppressWarnings("unused") // invoked by benchmarking framework -public class RoundingBenchmark { - - private final ZoneId zoneId = ZoneId.of("Europe/Amsterdam"); - private final DateTimeZone timeZone = DateUtils.zoneIdToDateTimeZone(zoneId); - - private long timestamp = 1548879021354L; - - private final org.opensearch.common.rounding.Rounding jodaRounding = org.opensearch.common.rounding.Rounding.builder( - DateTimeUnit.HOUR_OF_DAY - ).timeZone(timeZone).build(); - private final Rounding javaRounding = Rounding.builder(Rounding.DateTimeUnit.HOUR_OF_DAY).timeZone(zoneId).build(); - - @Benchmark - public long timeRoundingDateTimeUnitJoda() { - return jodaRounding.round(timestamp); - } - - @Benchmark - public long timeRoundingDateTimeUnitJava() { - return javaRounding.round(timestamp); - } - - private final org.opensearch.common.rounding.Rounding jodaDayOfMonthRounding = org.opensearch.common.rounding.Rounding.builder( - DateTimeUnit.DAY_OF_MONTH - ).timeZone(timeZone).build(); - private final Rounding javaDayOfMonthRounding = Rounding.builder(DAY_OF_MONTH).timeZone(zoneId).build(); - - @Benchmark - public long timeRoundingDateTimeUnitDayOfMonthJoda() { - return jodaDayOfMonthRounding.round(timestamp); - } - - @Benchmark - public long timeRoundingDateTimeUnitDayOfMonthJava() { - return javaDayOfMonthRounding.round(timestamp); - } - - private final org.opensearch.common.rounding.Rounding timeIntervalRoundingJoda = org.opensearch.common.rounding.Rounding.builder( - TimeValue.timeValueMinutes(60) - ).timeZone(timeZone).build(); - private final Rounding timeIntervalRoundingJava = Rounding.builder(TimeValue.timeValueMinutes(60)).timeZone(zoneId).build(); - - @Benchmark - public long timeIntervalRoundingJava() { - return timeIntervalRoundingJava.round(timestamp); - } - - @Benchmark - public long timeIntervalRoundingJoda() { - return timeIntervalRoundingJoda.round(timestamp); - } - - private final org.opensearch.common.rounding.Rounding timeUnitRoundingUtcDayOfMonthJoda = org.opensearch.common.rounding.Rounding - .builder(DateTimeUnit.DAY_OF_MONTH) - .timeZone(DateTimeZone.UTC) - .build(); - private final Rounding timeUnitRoundingUtcDayOfMonthJava = Rounding.builder(DAY_OF_MONTH).timeZone(ZoneOffset.UTC).build(); - - @Benchmark - public long 
timeUnitRoundingUtcDayOfMonthJava() { - return timeUnitRoundingUtcDayOfMonthJava.round(timestamp); - } - - @Benchmark - public long timeUnitRoundingUtcDayOfMonthJoda() { - return timeUnitRoundingUtcDayOfMonthJoda.round(timestamp); - } - - private final org.opensearch.common.rounding.Rounding timeUnitRoundingUtcQuarterOfYearJoda = org.opensearch.common.rounding.Rounding - .builder(DateTimeUnit.QUARTER) - .timeZone(DateTimeZone.UTC) - .build(); - private final Rounding timeUnitRoundingUtcQuarterOfYearJava = Rounding.builder(QUARTER_OF_YEAR).timeZone(ZoneOffset.UTC).build(); - - @Benchmark - public long timeUnitRoundingUtcQuarterOfYearJava() { - return timeUnitRoundingUtcQuarterOfYearJava.round(timestamp); - } - - @Benchmark - public long timeUnitRoundingUtcQuarterOfYearJoda() { - return timeUnitRoundingUtcQuarterOfYearJoda.round(timestamp); - } - - private final org.opensearch.common.rounding.Rounding timeUnitRoundingUtcMonthOfYearJoda = org.opensearch.common.rounding.Rounding - .builder(DateTimeUnit.MONTH_OF_YEAR) - .timeZone(DateTimeZone.UTC) - .build(); - private final Rounding timeUnitRoundingUtcMonthOfYearJava = Rounding.builder(MONTH_OF_YEAR).timeZone(ZoneOffset.UTC).build(); - - @Benchmark - public long timeUnitRoundingUtcMonthOfYearJava() { - return timeUnitRoundingUtcMonthOfYearJava.round(timestamp); - } - - @Benchmark - public long timeUnitRoundingUtcMonthOfYearJoda() { - return timeUnitRoundingUtcMonthOfYearJoda.round(timestamp); - } - - private final org.opensearch.common.rounding.Rounding timeUnitRoundingUtcYearOfCenturyJoda = org.opensearch.common.rounding.Rounding - .builder(DateTimeUnit.YEAR_OF_CENTURY) - .timeZone(DateTimeZone.UTC) - .build(); - private final Rounding timeUnitRoundingUtcYearOfCenturyJava = Rounding.builder(YEAR_OF_CENTURY).timeZone(ZoneOffset.UTC).build(); - - @Benchmark - public long timeUnitRoundingUtcYearOfCenturyJava() { - return timeUnitRoundingUtcYearOfCenturyJava.round(timestamp); - } - - @Benchmark - public long timeUnitRoundingUtcYearOfCenturyJoda() { - return timeUnitRoundingUtcYearOfCenturyJoda.round(timestamp); - } -} diff --git a/server/src/main/java/org/opensearch/common/rounding/DateTimeUnit.java b/server/src/main/java/org/opensearch/common/rounding/DateTimeUnit.java deleted file mode 100644 index 47e182b3caf84..0000000000000 --- a/server/src/main/java/org/opensearch/common/rounding/DateTimeUnit.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. 
- */ - -package org.opensearch.common.rounding; - -import org.opensearch.OpenSearchException; -import org.opensearch.common.joda.Joda; -import org.joda.time.DateTimeField; -import org.joda.time.DateTimeZone; -import org.joda.time.chrono.ISOChronology; - -import java.util.function.Function; - -/** - * Main date time unit class. - * - * @opensearch.internal - */ -public enum DateTimeUnit { - - WEEK_OF_WEEKYEAR((byte) 1, tz -> ISOChronology.getInstance(tz).weekOfWeekyear()), - YEAR_OF_CENTURY((byte) 2, tz -> ISOChronology.getInstance(tz).yearOfCentury()), - QUARTER((byte) 3, tz -> Joda.QuarterOfYear.getField(ISOChronology.getInstance(tz))), - MONTH_OF_YEAR((byte) 4, tz -> ISOChronology.getInstance(tz).monthOfYear()), - DAY_OF_MONTH((byte) 5, tz -> ISOChronology.getInstance(tz).dayOfMonth()), - HOUR_OF_DAY((byte) 6, tz -> ISOChronology.getInstance(tz).hourOfDay()), - MINUTES_OF_HOUR((byte) 7, tz -> ISOChronology.getInstance(tz).minuteOfHour()), - SECOND_OF_MINUTE((byte) 8, tz -> ISOChronology.getInstance(tz).secondOfMinute()); - - private final byte id; - private final Function fieldFunction; - - DateTimeUnit(byte id, Function fieldFunction) { - this.id = id; - this.fieldFunction = fieldFunction; - } - - public byte id() { - return id; - } - - /** - * @return the {@link DateTimeField} for the provided {@link DateTimeZone} for this time unit - */ - public DateTimeField field(DateTimeZone tz) { - return fieldFunction.apply(tz); - } - - public static DateTimeUnit resolve(byte id) { - switch (id) { - case 1: - return WEEK_OF_WEEKYEAR; - case 2: - return YEAR_OF_CENTURY; - case 3: - return QUARTER; - case 4: - return MONTH_OF_YEAR; - case 5: - return DAY_OF_MONTH; - case 6: - return HOUR_OF_DAY; - case 7: - return MINUTES_OF_HOUR; - case 8: - return SECOND_OF_MINUTE; - default: - throw new OpenSearchException("Unknown date time unit id [" + id + "]"); - } - } -} diff --git a/server/src/main/java/org/opensearch/common/rounding/Rounding.java b/server/src/main/java/org/opensearch/common/rounding/Rounding.java deleted file mode 100644 index 41e808b64f7d9..0000000000000 --- a/server/src/main/java/org/opensearch/common/rounding/Rounding.java +++ /dev/null @@ -1,459 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. 
- */ - -package org.opensearch.common.rounding; - -import org.opensearch.OpenSearchException; -import org.opensearch.common.unit.TimeValue; -import org.opensearch.core.common.io.stream.StreamInput; -import org.opensearch.core.common.io.stream.StreamOutput; -import org.opensearch.core.common.io.stream.Writeable; -import org.joda.time.DateTimeField; -import org.joda.time.DateTimeZone; -import org.joda.time.IllegalInstantException; - -import java.io.IOException; -import java.util.Objects; - -/** - * A strategy for rounding long values. - *
<p>
- * Use the java based Rounding class where applicable - * - * @opensearch.internal - */ -@Deprecated -public abstract class Rounding implements Writeable { - - public abstract byte id(); - - /** - * Rounds the given value. - */ - public abstract long round(long value); - - /** - * Given the rounded value (which was potentially generated by {@link #round(long)}, returns the next rounding value. For example, with - * interval based rounding, if the interval is 3, {@code nextRoundValue(6) = 9 }. - * - * @param value The current rounding value - * @return The next rounding value; - */ - public abstract long nextRoundingValue(long value); - - @Override - public abstract boolean equals(Object obj); - - @Override - public abstract int hashCode(); - - public static Builder builder(DateTimeUnit unit) { - return new Builder(unit); - } - - public static Builder builder(TimeValue interval) { - return new Builder(interval); - } - - /** - * Builder for rounding - * - * @opensearch.internal - */ - public static class Builder { - - private final DateTimeUnit unit; - private final long interval; - - private DateTimeZone timeZone = DateTimeZone.UTC; - - public Builder(DateTimeUnit unit) { - this.unit = unit; - this.interval = -1; - } - - public Builder(TimeValue interval) { - this.unit = null; - if (interval.millis() < 1) throw new IllegalArgumentException("Zero or negative time interval not supported"); - this.interval = interval.millis(); - } - - public Builder timeZone(DateTimeZone timeZone) { - if (timeZone == null) { - throw new IllegalArgumentException("Setting null as timezone is not supported"); - } - this.timeZone = timeZone; - return this; - } - - public Rounding build() { - Rounding timeZoneRounding; - if (unit != null) { - timeZoneRounding = new TimeUnitRounding(unit, timeZone); - } else { - timeZoneRounding = new TimeIntervalRounding(interval, timeZone); - } - return timeZoneRounding; - } - } - - /** - * Rounding time units - * - * @opensearch.internal - */ - static class TimeUnitRounding extends Rounding { - - static final byte ID = 1; - - private final DateTimeUnit unit; - private final DateTimeField field; - private final DateTimeZone timeZone; - private final boolean unitRoundsToMidnight; - - TimeUnitRounding(DateTimeUnit unit, DateTimeZone timeZone) { - this.unit = unit; - this.field = unit.field(timeZone); - unitRoundsToMidnight = this.field.getDurationField().getUnitMillis() > 60L * 60L * 1000L; - this.timeZone = timeZone; - } - - TimeUnitRounding(StreamInput in) throws IOException { - unit = DateTimeUnit.resolve(in.readByte()); - timeZone = DateTimeZone.forID(in.readString()); - field = unit.field(timeZone); - unitRoundsToMidnight = field.getDurationField().getUnitMillis() > 60L * 60L * 1000L; - } - - @Override - public byte id() { - return ID; - } - - /** - * @return The latest timestamp T which is strictly before utcMillis - * and such that timeZone.getOffset(T) != timeZone.getOffset(utcMillis). - * If there is no such T, returns Long.MAX_VALUE. - */ - private long previousTransition(long utcMillis) { - final int offsetAtInputTime = timeZone.getOffset(utcMillis); - do { - // Some timezones have transitions that do not change the offset, so we have to - // repeatedly call previousTransition until a nontrivial transition is found. 
- - long previousTransition = timeZone.previousTransition(utcMillis); - if (previousTransition == utcMillis) { - // There are no earlier transitions - return Long.MAX_VALUE; - } - assert previousTransition < utcMillis; // Progress was made - utcMillis = previousTransition; - } while (timeZone.getOffset(utcMillis) == offsetAtInputTime); - - return utcMillis; - } - - @Override - public long round(long utcMillis) { - - // field.roundFloor() works as long as the offset doesn't change. It is worth getting this case out of the way first, as - // the calculations for fixing things near to offset changes are a little expensive and are unnecessary in the common case - // of working in UTC. - if (timeZone.isFixed()) { - return field.roundFloor(utcMillis); - } - - // When rounding to hours we consider any local time of the form 'xx:00:00' as rounded, even though this gives duplicate - // bucket names for the times when the clocks go back. Shorter units behave similarly. However, longer units round down to - // midnight, and on the days where there are two midnights we would rather pick the earlier one, so that buckets are - // uniquely identified by the date. - if (unitRoundsToMidnight) { - final long anyLocalStartOfDay = field.roundFloor(utcMillis); - // `anyLocalStartOfDay` is _supposed_ to be the Unix timestamp for the start of the day in question in the current time - // zone. Mostly this just means "midnight", which is fine, and on days with no local midnight it's the first time that - // does occur on that day which is also ok. However, on days with >1 local midnight this is _one_ of the midnights, but - // may not be the first. Check whether this is happening, and fix it if so. - - final long previousTransition = previousTransition(anyLocalStartOfDay); - - if (previousTransition == Long.MAX_VALUE) { - // No previous transitions, so there can't be another earlier local midnight. - return anyLocalStartOfDay; - } - - final long currentOffset = timeZone.getOffset(anyLocalStartOfDay); - final long previousOffset = timeZone.getOffset(previousTransition); - assert currentOffset != previousOffset; - - // NB we only assume interference from one previous transition. It's theoretically possible to have two transitions in - // quick succession, both of which have a midnight in them, but this doesn't appear to happen in the TZDB so (a) it's - // pointless to implement and (b) it won't be tested. I recognise that this comment is tempting fate and will likely - // cause this very situation to occur in the near future, and eagerly look forward to fixing this using a loop over - // previous transitions when it happens. - - final long alsoLocalStartOfDay = anyLocalStartOfDay + currentOffset - previousOffset; - // `alsoLocalStartOfDay` is the Unix timestamp for the start of the day in question if the previous offset were in - // effect. - - if (alsoLocalStartOfDay <= previousTransition) { - // Therefore the previous offset _is_ in effect at `alsoLocalStartOfDay`, and it's earlier than anyLocalStartOfDay, - // so this is the answer to use. - return alsoLocalStartOfDay; - } else { - // The previous offset is not in effect at `alsoLocalStartOfDay`, so the current offset must be. - return anyLocalStartOfDay; - } - - } else { - do { - long rounded = field.roundFloor(utcMillis); - - // field.roundFloor() mostly works as long as the offset hasn't changed in [rounded, utcMillis], so look at where - // the offset most recently changed. 
- - final long previousTransition = previousTransition(utcMillis); - - if (previousTransition == Long.MAX_VALUE || previousTransition < rounded) { - // The offset did not change in [rounded, utcMillis], so roundFloor() worked as expected. - return rounded; - } - - // The offset _did_ change in [rounded, utcMillis]. Put differently, this means that none of the times in - // [previousTransition+1, utcMillis] were rounded, so the rounded time must be <= previousTransition. This means - // it's sufficient to try and round previousTransition down. - assert previousTransition < utcMillis; - utcMillis = previousTransition; - } while (true); - } - } - - @Override - public long nextRoundingValue(long utcMillis) { - long floor = round(utcMillis); - // add one unit and round to get to next rounded value - long next = round(field.add(floor, 1)); - if (next == floor) { - // in rare case we need to add more than one unit - next = round(field.add(floor, 2)); - } - return next; - } - - @Override - public void writeTo(StreamOutput out) throws IOException { - out.writeByte(unit.id()); - out.writeString(timeZone.getID()); - } - - @Override - public int hashCode() { - return Objects.hash(unit, timeZone); - } - - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - TimeUnitRounding other = (TimeUnitRounding) obj; - return Objects.equals(unit, other.unit) && Objects.equals(timeZone, other.timeZone); - } - - @Override - public String toString() { - return "[" + timeZone + "][" + unit + "]"; - } - } - - /** - * Rounding time intervals - * - * @opensearch.internal - */ - static class TimeIntervalRounding extends Rounding { - - static final byte ID = 2; - - private final long interval; - private final DateTimeZone timeZone; - - TimeIntervalRounding(long interval, DateTimeZone timeZone) { - if (interval < 1) throw new IllegalArgumentException("Zero or negative time interval not supported"); - this.interval = interval; - this.timeZone = timeZone; - } - - TimeIntervalRounding(StreamInput in) throws IOException { - interval = in.readVLong(); - timeZone = DateTimeZone.forID(in.readString()); - } - - @Override - public byte id() { - return ID; - } - - @Override - public long round(long utcMillis) { - long timeLocal = timeZone.convertUTCToLocal(utcMillis); - long rounded = roundKey(timeLocal, interval) * interval; - long roundedUTC; - if (isInDSTGap(rounded) == false) { - roundedUTC = timeZone.convertLocalToUTC(rounded, true, utcMillis); - // check if we crossed DST transition, in this case we want the - // last rounded value before the transition - long transition = timeZone.previousTransition(utcMillis); - if (transition != utcMillis && transition > roundedUTC) { - roundedUTC = round(transition - 1); - } - } else { - /* - * Edge case where the rounded local time is illegal and landed - * in a DST gap. In this case, we choose 1ms tick after the - * transition date. We don't want the transition date itself - * because those dates, when rounded themselves, fall into the - * previous interval. This would violate the invariant that the - * rounding operation should be idempotent. 
- */ - roundedUTC = timeZone.previousTransition(utcMillis) + 1; - } - return roundedUTC; - } - - private static long roundKey(long value, long interval) { - if (value < 0) { - return (value - interval + 1) / interval; - } else { - return value / interval; - } - } - - /** - * Determine whether the local instant is a valid instant in the given - * time zone. The logic for this is taken from - * {@link DateTimeZone#convertLocalToUTC(long, boolean)} for the - * `strict` mode case, but instead of throwing an - * {@link IllegalInstantException}, which is costly, we want to return a - * flag indicating that the value is illegal in that time zone. - */ - private boolean isInDSTGap(long instantLocal) { - if (timeZone.isFixed()) { - return false; - } - // get the offset at instantLocal (first estimate) - int offsetLocal = timeZone.getOffset(instantLocal); - // adjust instantLocal using the estimate and recalc the offset - int offset = timeZone.getOffset(instantLocal - offsetLocal); - // if the offsets differ, we must be near a DST boundary - if (offsetLocal != offset) { - // determine if we are in the DST gap - long nextLocal = timeZone.nextTransition(instantLocal - offsetLocal); - if (nextLocal == (instantLocal - offsetLocal)) { - nextLocal = Long.MAX_VALUE; - } - long nextAdjusted = timeZone.nextTransition(instantLocal - offset); - if (nextAdjusted == (instantLocal - offset)) { - nextAdjusted = Long.MAX_VALUE; - } - if (nextLocal != nextAdjusted) { - // we are in the DST gap - return true; - } - } - return false; - } - - @Override - public long nextRoundingValue(long time) { - long timeLocal = time; - timeLocal = timeZone.convertUTCToLocal(time); - long next = timeLocal + interval; - return timeZone.convertLocalToUTC(next, false); - } - - @Override - public void writeTo(StreamOutput out) throws IOException { - out.writeVLong(interval); - out.writeString(timeZone.getID()); - } - - @Override - public int hashCode() { - return Objects.hash(interval, timeZone); - } - - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - TimeIntervalRounding other = (TimeIntervalRounding) obj; - return Objects.equals(interval, other.interval) && Objects.equals(timeZone, other.timeZone); - } - } - - /** - * Rounding streams - * - * @opensearch.internal - */ - public static class Streams { - - public static void write(Rounding rounding, StreamOutput out) throws IOException { - out.writeByte(rounding.id()); - rounding.writeTo(out); - } - - public static Rounding read(StreamInput in) throws IOException { - Rounding rounding; - byte id = in.readByte(); - switch (id) { - case TimeUnitRounding.ID: - rounding = new TimeUnitRounding(in); - break; - case TimeIntervalRounding.ID: - rounding = new TimeIntervalRounding(in); - break; - default: - throw new OpenSearchException("unknown rounding id [" + id + "]"); - } - return rounding; - } - - } - -} diff --git a/server/src/main/java/org/opensearch/common/rounding/package-info.java b/server/src/main/java/org/opensearch/common/rounding/package-info.java deleted file mode 100644 index 5fa3e39c6a786..0000000000000 --- a/server/src/main/java/org/opensearch/common/rounding/package-info.java +++ /dev/null @@ -1,10 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/** Base DateTime rounding package. 
*/ -package org.opensearch.common.rounding; diff --git a/server/src/test/java/org/opensearch/common/RoundingTests.java b/server/src/test/java/org/opensearch/common/RoundingTests.java index 1a499bac3e2e8..cc71ee08abcca 100644 --- a/server/src/test/java/org/opensearch/common/RoundingTests.java +++ b/server/src/test/java/org/opensearch/common/RoundingTests.java @@ -33,7 +33,6 @@ package org.opensearch.common; import org.opensearch.common.collect.Tuple; -import org.opensearch.common.rounding.DateTimeUnit; import org.opensearch.common.time.DateFormatter; import org.opensearch.common.time.DateFormatters; import org.opensearch.common.unit.TimeValue; @@ -236,7 +235,7 @@ public void testOffsetRounding() { /** * Randomized test on TimeUnitRounding. Test uses random - * {@link DateTimeUnit} and {@link ZoneId} and often (50% of the time) + * {@link org.opensearch.common.Rounding.DateTimeUnit} and {@link ZoneId} and often (50% of the time) * chooses test dates that are exactly on or close to offset changes (e.g. * DST) in the chosen time zone. *

diff --git a/server/src/test/java/org/opensearch/common/rounding/DateTimeUnitTests.java b/server/src/test/java/org/opensearch/common/rounding/DateTimeUnitTests.java deleted file mode 100644 index 7b87e136c5f38..0000000000000 --- a/server/src/test/java/org/opensearch/common/rounding/DateTimeUnitTests.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. - */ - -package org.opensearch.common.rounding; - -import org.opensearch.test.OpenSearchTestCase; - -import static org.opensearch.common.rounding.DateTimeUnit.DAY_OF_MONTH; -import static org.opensearch.common.rounding.DateTimeUnit.HOUR_OF_DAY; -import static org.opensearch.common.rounding.DateTimeUnit.MINUTES_OF_HOUR; -import static org.opensearch.common.rounding.DateTimeUnit.MONTH_OF_YEAR; -import static org.opensearch.common.rounding.DateTimeUnit.QUARTER; -import static org.opensearch.common.rounding.DateTimeUnit.SECOND_OF_MINUTE; -import static org.opensearch.common.rounding.DateTimeUnit.WEEK_OF_WEEKYEAR; -import static org.opensearch.common.rounding.DateTimeUnit.YEAR_OF_CENTURY; - -public class DateTimeUnitTests extends OpenSearchTestCase { - - /** - * test that we don't accidentally change enum ids - */ - public void testEnumIds() { - assertEquals(1, WEEK_OF_WEEKYEAR.id()); - assertEquals(WEEK_OF_WEEKYEAR, DateTimeUnit.resolve((byte) 1)); - - assertEquals(2, YEAR_OF_CENTURY.id()); - assertEquals(YEAR_OF_CENTURY, DateTimeUnit.resolve((byte) 2)); - - assertEquals(3, QUARTER.id()); - assertEquals(QUARTER, DateTimeUnit.resolve((byte) 3)); - - assertEquals(4, MONTH_OF_YEAR.id()); - assertEquals(MONTH_OF_YEAR, DateTimeUnit.resolve((byte) 4)); - - assertEquals(5, DAY_OF_MONTH.id()); - assertEquals(DAY_OF_MONTH, DateTimeUnit.resolve((byte) 5)); - - assertEquals(6, HOUR_OF_DAY.id()); - assertEquals(HOUR_OF_DAY, DateTimeUnit.resolve((byte) 6)); - - assertEquals(7, MINUTES_OF_HOUR.id()); - assertEquals(MINUTES_OF_HOUR, DateTimeUnit.resolve((byte) 7)); - - assertEquals(8, SECOND_OF_MINUTE.id()); - assertEquals(SECOND_OF_MINUTE, DateTimeUnit.resolve((byte) 8)); - } -} diff --git a/server/src/test/java/org/opensearch/common/rounding/RoundingDuelTests.java b/server/src/test/java/org/opensearch/common/rounding/RoundingDuelTests.java deleted file mode 100644 index 3088067cd1f84..0000000000000 --- a/server/src/test/java/org/opensearch/common/rounding/RoundingDuelTests.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors 
require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. - */ - -package org.opensearch.common.rounding; - -import org.opensearch.common.unit.TimeValue; -import org.opensearch.test.OpenSearchTestCase; -import org.joda.time.DateTimeZone; - -import java.time.ZoneOffset; - -import static org.hamcrest.Matchers.is; - -public class RoundingDuelTests extends OpenSearchTestCase { - - // dont include nano/micro seconds as rounding would become zero then and throw an exception - private static final String[] ALLOWED_TIME_SUFFIXES = new String[] { "d", "h", "ms", "s", "m" }; - - public void testDuellingImplementations() { - org.opensearch.common.Rounding.DateTimeUnit randomDateTimeUnit = randomFrom(org.opensearch.common.Rounding.DateTimeUnit.values()); - org.opensearch.common.Rounding.Prepared rounding; - Rounding roundingJoda; - - if (randomBoolean()) { - rounding = org.opensearch.common.Rounding.builder(randomDateTimeUnit).timeZone(ZoneOffset.UTC).build().prepareForUnknown(); - DateTimeUnit dateTimeUnit = DateTimeUnit.resolve(randomDateTimeUnit.getId()); - roundingJoda = Rounding.builder(dateTimeUnit).timeZone(DateTimeZone.UTC).build(); - } else { - TimeValue interval = timeValue(); - rounding = org.opensearch.common.Rounding.builder(interval).timeZone(ZoneOffset.UTC).build().prepareForUnknown(); - roundingJoda = Rounding.builder(interval).timeZone(DateTimeZone.UTC).build(); - } - - long roundValue = randomLong(); - assertThat(roundingJoda.round(roundValue), is(rounding.round(roundValue))); - } - - static TimeValue timeValue() { - return TimeValue.parseTimeValue(randomIntBetween(1, 1000) + randomFrom(ALLOWED_TIME_SUFFIXES), "settingName"); - } -} diff --git a/server/src/test/java/org/opensearch/common/rounding/TimeZoneRoundingTests.java b/server/src/test/java/org/opensearch/common/rounding/TimeZoneRoundingTests.java deleted file mode 100644 index d1b3adcd55f0c..0000000000000 --- a/server/src/test/java/org/opensearch/common/rounding/TimeZoneRoundingTests.java +++ /dev/null @@ -1,822 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. 
Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. - */ - -package org.opensearch.common.rounding; - -import org.opensearch.common.collect.Tuple; -import org.opensearch.common.rounding.Rounding.TimeIntervalRounding; -import org.opensearch.common.rounding.Rounding.TimeUnitRounding; -import org.opensearch.common.unit.TimeValue; -import org.opensearch.test.OpenSearchTestCase; -import org.joda.time.DateTime; -import org.joda.time.DateTimeConstants; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; -import org.hamcrest.Description; -import org.hamcrest.Matcher; -import org.hamcrest.TypeSafeMatcher; - -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.greaterThan; -import static org.hamcrest.Matchers.greaterThanOrEqualTo; -import static org.hamcrest.Matchers.lessThan; -import static org.hamcrest.Matchers.lessThanOrEqualTo; -import static org.hamcrest.Matchers.startsWith; - -public class TimeZoneRoundingTests extends OpenSearchTestCase { - - public void testUTCTimeUnitRounding() { - Rounding tzRounding = Rounding.builder(DateTimeUnit.MONTH_OF_YEAR).build(); - DateTimeZone tz = DateTimeZone.UTC; - assertThat(tzRounding.round(time("2009-02-03T01:01:01")), isDate(time("2009-02-01T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-01T00:00:00.000Z")), isDate(time("2009-03-01T00:00:00.000Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.WEEK_OF_WEEKYEAR).build(); - assertThat(tzRounding.round(time("2012-01-10T01:01:01")), isDate(time("2012-01-09T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2012-01-09T00:00:00.000Z")), isDate(time("2012-01-16T00:00:00.000Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.QUARTER).build(); - assertThat(tzRounding.round(time("2012-01-10T01:01:01")), isDate(time("2012-01-01T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2012-01-09T00:00:00.000Z")), isDate(time("2012-04-01T00:00:00.000Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.HOUR_OF_DAY).build(); - assertThat(tzRounding.round(time("2012-01-10T01:01:01")), isDate(time("2012-01-10T01:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2012-01-09T00:00:00.000Z")), isDate(time("2012-01-09T01:00:00.000Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.DAY_OF_MONTH).build(); - assertThat(tzRounding.round(time("2012-01-10T01:01:01")), isDate(time("2012-01-10T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2012-01-09T00:00:00.000Z")), isDate(time("2012-01-10T00:00:00.000Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.YEAR_OF_CENTURY).build(); - 
assertThat(tzRounding.round(time("2012-01-10T01:01:01")), isDate(time("2012-01-01T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2012-01-09T00:00:00.000Z")), isDate(time("2013-01-01T00:00:00.000Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.MINUTES_OF_HOUR).build(); - assertThat(tzRounding.round(time("2012-01-10T01:01:01")), isDate(time("2012-01-10T01:01:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2012-01-09T00:00:00.000Z")), isDate(time("2012-01-09T00:01:00.000Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.SECOND_OF_MINUTE).build(); - assertThat(tzRounding.round(time("2012-01-10T01:01:01")), isDate(time("2012-01-10T01:01:01.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2012-01-09T00:00:00.000Z")), isDate(time("2012-01-09T00:00:01.000Z"), tz)); - } - - public void testUTCIntervalRounding() { - Rounding tzRounding = Rounding.builder(TimeValue.timeValueHours(12)).build(); - DateTimeZone tz = DateTimeZone.UTC; - assertThat(tzRounding.round(time("2009-02-03T01:01:01")), isDate(time("2009-02-03T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-03T00:00:00.000Z")), isDate(time("2009-02-03T12:00:00.000Z"), tz)); - assertThat(tzRounding.round(time("2009-02-03T13:01:01")), isDate(time("2009-02-03T12:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-03T12:00:00.000Z")), isDate(time("2009-02-04T00:00:00.000Z"), tz)); - - tzRounding = Rounding.builder(TimeValue.timeValueHours(48)).build(); - assertThat(tzRounding.round(time("2009-02-03T01:01:01")), isDate(time("2009-02-03T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-03T00:00:00.000Z")), isDate(time("2009-02-05T00:00:00.000Z"), tz)); - assertThat(tzRounding.round(time("2009-02-05T13:01:01")), isDate(time("2009-02-05T00:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-05T00:00:00.000Z")), isDate(time("2009-02-07T00:00:00.000Z"), tz)); - } - - /** - * test TimeIntervalRounding, (interval < 12h) with time zone shift - */ - public void testTimeIntervalRounding() { - DateTimeZone tz = DateTimeZone.forOffsetHours(-1); - Rounding tzRounding = Rounding.builder(TimeValue.timeValueHours(6)).timeZone(tz).build(); - assertThat(tzRounding.round(time("2009-02-03T00:01:01")), isDate(time("2009-02-02T19:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-02T19:00:00.000Z")), isDate(time("2009-02-03T01:00:00.000Z"), tz)); - - assertThat(tzRounding.round(time("2009-02-03T13:01:01")), isDate(time("2009-02-03T13:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-03T13:00:00.000Z")), isDate(time("2009-02-03T19:00:00.000Z"), tz)); - } - - /** - * test DayIntervalRounding, (interval >= 12h) with time zone shift - */ - public void testDayIntervalRounding() { - DateTimeZone tz = DateTimeZone.forOffsetHours(-8); - Rounding tzRounding = Rounding.builder(TimeValue.timeValueHours(12)).timeZone(tz).build(); - assertThat(tzRounding.round(time("2009-02-03T00:01:01")), isDate(time("2009-02-02T20:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-02T20:00:00.000Z")), isDate(time("2009-02-03T08:00:00.000Z"), tz)); - - assertThat(tzRounding.round(time("2009-02-03T13:01:01")), isDate(time("2009-02-03T08:00:00.000Z"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-03T08:00:00.000Z")), isDate(time("2009-02-03T20:00:00.000Z"), tz)); - } - - public void testDayRounding() { - int timezoneOffset = -2; - Rounding 
tzRounding = Rounding.builder(DateTimeUnit.DAY_OF_MONTH).timeZone(DateTimeZone.forOffsetHours(timezoneOffset)).build(); - assertThat(tzRounding.round(0), equalTo(0L - TimeValue.timeValueHours(24 + timezoneOffset).millis())); - assertThat( - tzRounding.nextRoundingValue(0L - TimeValue.timeValueHours(24 + timezoneOffset).millis()), - equalTo(TimeValue.timeValueHours(-timezoneOffset).millis()) - ); - - DateTimeZone tz = DateTimeZone.forID("-08:00"); - tzRounding = Rounding.builder(DateTimeUnit.DAY_OF_MONTH).timeZone(tz).build(); - assertThat(tzRounding.round(time("2012-04-01T04:15:30Z")), isDate(time("2012-03-31T08:00:00Z"), tz)); - - tzRounding = Rounding.builder(DateTimeUnit.MONTH_OF_YEAR).timeZone(tz).build(); - assertThat(tzRounding.round(time("2012-04-01T04:15:30Z")), equalTo(time("2012-03-01T08:00:00Z"))); - - // date in Feb-3rd, but still in Feb-2nd in -02:00 timezone - tz = DateTimeZone.forID("-02:00"); - tzRounding = Rounding.builder(DateTimeUnit.DAY_OF_MONTH).timeZone(tz).build(); - assertThat(tzRounding.round(time("2009-02-03T01:01:01")), isDate(time("2009-02-02T02:00:00"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-02T02:00:00")), isDate(time("2009-02-03T02:00:00"), tz)); - - // date in Feb-3rd, also in -02:00 timezone - tzRounding = Rounding.builder(DateTimeUnit.DAY_OF_MONTH).timeZone(tz).build(); - assertThat(tzRounding.round(time("2009-02-03T02:01:01")), isDate(time("2009-02-03T02:00:00"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-03T02:00:00")), isDate(time("2009-02-04T02:00:00"), tz)); - } - - public void testTimeRounding() { - // hour unit - DateTimeZone tz = DateTimeZone.forOffsetHours(-2); - Rounding tzRounding = Rounding.builder(DateTimeUnit.HOUR_OF_DAY).timeZone(tz).build(); - assertThat(tzRounding.round(0), equalTo(0L)); - assertThat(tzRounding.nextRoundingValue(0L), equalTo(TimeValue.timeValueHours(1L).getMillis())); - - assertThat(tzRounding.round(time("2009-02-03T01:01:01")), isDate(time("2009-02-03T01:00:00"), tz)); - assertThat(tzRounding.nextRoundingValue(time("2009-02-03T01:00:00")), isDate(time("2009-02-03T02:00:00"), tz)); - } - - public void testTimeUnitRoundingDST() { - Rounding tzRounding; - // testing savings to non savings switch - DateTimeZone cet = DateTimeZone.forID("CET"); - tzRounding = Rounding.builder(DateTimeUnit.HOUR_OF_DAY).timeZone(cet).build(); - assertThat(tzRounding.round(time("2014-10-26T01:01:01", cet)), isDate(time("2014-10-26T01:00:00+02:00"), cet)); - assertThat(tzRounding.nextRoundingValue(time("2014-10-26T01:00:00", cet)), isDate(time("2014-10-26T02:00:00+02:00"), cet)); - assertThat(tzRounding.nextRoundingValue(time("2014-10-26T02:00:00", cet)), isDate(time("2014-10-26T02:00:00+01:00"), cet)); - - // testing non savings to savings switch - tzRounding = Rounding.builder(DateTimeUnit.HOUR_OF_DAY).timeZone(cet).build(); - assertThat(tzRounding.round(time("2014-03-30T01:01:01", cet)), isDate(time("2014-03-30T01:00:00+01:00"), cet)); - assertThat(tzRounding.nextRoundingValue(time("2014-03-30T01:00:00", cet)), isDate(time("2014-03-30T03:00:00", cet), cet)); - assertThat(tzRounding.nextRoundingValue(time("2014-03-30T03:00:00", cet)), isDate(time("2014-03-30T04:00:00", cet), cet)); - - // testing non savings to savings switch (America/Chicago) - DateTimeZone chg = DateTimeZone.forID("America/Chicago"); - Rounding tzRounding_utc = Rounding.builder(DateTimeUnit.HOUR_OF_DAY).timeZone(DateTimeZone.UTC).build(); - assertThat(tzRounding.round(time("2014-03-09T03:01:01", chg)), 
isDate(time("2014-03-09T03:00:00", chg), chg)); - - Rounding tzRounding_chg = Rounding.builder(DateTimeUnit.HOUR_OF_DAY).timeZone(chg).build(); - assertThat(tzRounding_chg.round(time("2014-03-09T03:01:01", chg)), isDate(time("2014-03-09T03:00:00", chg), chg)); - - // testing savings to non savings switch 2013 (America/Chicago) - assertThat(tzRounding_utc.round(time("2013-11-03T06:01:01", chg)), isDate(time("2013-11-03T06:00:00", chg), chg)); - assertThat(tzRounding_chg.round(time("2013-11-03T06:01:01", chg)), isDate(time("2013-11-03T06:00:00", chg), chg)); - - // testing savings to non savings switch 2014 (America/Chicago) - assertThat(tzRounding_utc.round(time("2014-11-02T06:01:01", chg)), isDate(time("2014-11-02T06:00:00", chg), chg)); - assertThat(tzRounding_chg.round(time("2014-11-02T06:01:01", chg)), isDate(time("2014-11-02T06:00:00", chg), chg)); - } - - /** - * Randomized test on TimeUnitRounding. Test uses random - * {@link DateTimeUnit} and {@link DateTimeZone} and often (50% of the time) - * chooses test dates that are exactly on or close to offset changes (e.g. - * DST) in the chosen time zone. - *

- * It rounds the test date down and up and performs various checks on the - * rounding unit interval that is defined by this. Assumptions tested are - * described in - * {@link #assertInterval(long, long, long, Rounding, DateTimeZone)} - */ - public void testRoundingRandom() { - for (int i = 0; i < 1000; ++i) { - DateTimeUnit timeUnit = randomTimeUnit(); - DateTimeZone tz = randomDateTimeZone(); - Rounding rounding = new Rounding.TimeUnitRounding(timeUnit, tz); - long date = Math.abs(randomLong() % (2 * (long) 10e11)); // 1970-01-01T00:00:00Z - 2033-05-18T05:33:20.000+02:00 - long unitMillis = timeUnit.field(tz).getDurationField().getUnitMillis(); - if (randomBoolean()) { - nastyDate(date, tz, unitMillis); - } - final long roundedDate = rounding.round(date); - final long nextRoundingValue = rounding.nextRoundingValue(roundedDate); - - assertInterval(roundedDate, date, nextRoundingValue, rounding, tz); - - // check correct unit interval width for units smaller than a day, they should be fixed size except for transitions - if (unitMillis <= DateTimeConstants.MILLIS_PER_DAY) { - // if the interval defined didn't cross timezone offset transition, it should cover unitMillis width - if (tz.getOffset(roundedDate - 1) == tz.getOffset(nextRoundingValue + 1)) { - assertThat( - "unit interval width not as expected for [" + timeUnit + "], [" + tz + "] at " + new DateTime(roundedDate), - nextRoundingValue - roundedDate, - equalTo(unitMillis) - ); - } - } - } - } - - /** - * To be even more nasty, go to a transition in the selected time zone. - * In one third of the cases stay there, otherwise go half a unit back or forth - */ - private static long nastyDate(long initialDate, DateTimeZone timezone, long unitMillis) { - long date = timezone.nextTransition(initialDate); - if (randomBoolean()) { - return date + (randomLong() % unitMillis); // positive and negative offset possible - } else { - return date; - } - } - - /** - * test DST end with interval rounding - * CET: 25 October 2015, 03:00:00 clocks were turned backward 1 hour to 25 October 2015, 02:00:00 local standard time - */ - public void testTimeIntervalCET_DST_End() { - long interval = TimeUnit.MINUTES.toMillis(20); - DateTimeZone tz = DateTimeZone.forID("CET"); - Rounding rounding = new TimeIntervalRounding(interval, tz); - - assertThat(rounding.round(time("2015-10-25T01:55:00+02:00")), isDate(time("2015-10-25T01:40:00+02:00"), tz)); - assertThat(rounding.round(time("2015-10-25T02:15:00+02:00")), isDate(time("2015-10-25T02:00:00+02:00"), tz)); - assertThat(rounding.round(time("2015-10-25T02:35:00+02:00")), isDate(time("2015-10-25T02:20:00+02:00"), tz)); - assertThat(rounding.round(time("2015-10-25T02:55:00+02:00")), isDate(time("2015-10-25T02:40:00+02:00"), tz)); - // after DST shift - assertThat(rounding.round(time("2015-10-25T02:15:00+01:00")), isDate(time("2015-10-25T02:00:00+01:00"), tz)); - assertThat(rounding.round(time("2015-10-25T02:35:00+01:00")), isDate(time("2015-10-25T02:20:00+01:00"), tz)); - assertThat(rounding.round(time("2015-10-25T02:55:00+01:00")), isDate(time("2015-10-25T02:40:00+01:00"), tz)); - assertThat(rounding.round(time("2015-10-25T03:15:00+01:00")), isDate(time("2015-10-25T03:00:00+01:00"), tz)); - } - - /** - * test DST start with interval rounding - * CET: 27 March 2016, 02:00:00 clocks were turned forward 1 hour to 27 March 2016, 03:00:00 local daylight time - */ - public void testTimeIntervalCET_DST_Start() { - long interval = TimeUnit.MINUTES.toMillis(20); - DateTimeZone tz = DateTimeZone.forID("CET"); - Rounding 
rounding = new TimeIntervalRounding(interval, tz); - // test DST start - assertThat(rounding.round(time("2016-03-27T01:55:00+01:00")), isDate(time("2016-03-27T01:40:00+01:00"), tz)); - assertThat(rounding.round(time("2016-03-27T02:00:00+01:00")), isDate(time("2016-03-27T03:00:00+02:00"), tz)); - assertThat(rounding.round(time("2016-03-27T03:15:00+02:00")), isDate(time("2016-03-27T03:00:00+02:00"), tz)); - assertThat(rounding.round(time("2016-03-27T03:35:00+02:00")), isDate(time("2016-03-27T03:20:00+02:00"), tz)); - } - - /** - * test DST start with offset not fitting interval, e.g. Asia/Kathmandu - * adding 15min on 1986-01-01T00:00:00 the interval from - * 1986-01-01T00:15:00+05:45 to 1986-01-01T00:20:00+05:45 to only be 5min - * long - */ - public void testTimeInterval_Kathmandu_DST_Start() { - long interval = TimeUnit.MINUTES.toMillis(20); - DateTimeZone tz = DateTimeZone.forID("Asia/Kathmandu"); - Rounding rounding = new TimeIntervalRounding(interval, tz); - assertThat(rounding.round(time("1985-12-31T23:55:00+05:30")), isDate(time("1985-12-31T23:40:00+05:30"), tz)); - assertThat(rounding.round(time("1986-01-01T00:16:00+05:45")), isDate(time("1986-01-01T00:15:00+05:45"), tz)); - assertThat(time("1986-01-01T00:15:00+05:45") - time("1985-12-31T23:40:00+05:30"), equalTo(TimeUnit.MINUTES.toMillis(20))); - assertThat(rounding.round(time("1986-01-01T00:26:00+05:45")), isDate(time("1986-01-01T00:20:00+05:45"), tz)); - assertThat(time("1986-01-01T00:20:00+05:45") - time("1986-01-01T00:15:00+05:45"), equalTo(TimeUnit.MINUTES.toMillis(5))); - assertThat(rounding.round(time("1986-01-01T00:46:00+05:45")), isDate(time("1986-01-01T00:40:00+05:45"), tz)); - assertThat(time("1986-01-01T00:40:00+05:45") - time("1986-01-01T00:20:00+05:45"), equalTo(TimeUnit.MINUTES.toMillis(20))); - } - - /** - * Special test for intervals that don't fit evenly into rounding interval. - * In this case, when interval crosses DST transition point, rounding in local - * time can land in a DST gap which results in wrong UTC rounding values. - */ - public void testIntervalRounding_NotDivisibleInteval() { - DateTimeZone tz = DateTimeZone.forID("CET"); - long interval = TimeUnit.MINUTES.toMillis(14); - Rounding rounding = new Rounding.TimeIntervalRounding(interval, tz); - - assertThat(rounding.round(time("2016-03-27T01:41:00+01:00")), isDate(time("2016-03-27T01:30:00+01:00"), tz)); - assertThat(rounding.round(time("2016-03-27T01:51:00+01:00")), isDate(time("2016-03-27T01:44:00+01:00"), tz)); - assertThat(rounding.round(time("2016-03-27T01:59:00+01:00")), isDate(time("2016-03-27T01:58:00+01:00"), tz)); - assertThat(rounding.round(time("2016-03-27T03:05:00+02:00")), isDate(time("2016-03-27T03:00:00+02:00"), tz)); - assertThat(rounding.round(time("2016-03-27T03:12:00+02:00")), isDate(time("2016-03-27T03:08:00+02:00"), tz)); - assertThat(rounding.round(time("2016-03-27T03:25:00+02:00")), isDate(time("2016-03-27T03:22:00+02:00"), tz)); - assertThat(rounding.round(time("2016-03-27T03:39:00+02:00")), isDate(time("2016-03-27T03:36:00+02:00"), tz)); - } - - /** - * Test for half day rounding intervals scrossing DST. 
- */ - public void testIntervalRounding_HalfDay_DST() { - DateTimeZone tz = DateTimeZone.forID("CET"); - long interval = TimeUnit.HOURS.toMillis(12); - Rounding rounding = new Rounding.TimeIntervalRounding(interval, tz); - - assertThat(rounding.round(time("2016-03-26T01:00:00+01:00")), isDate(time("2016-03-26T00:00:00+01:00"), tz)); - assertThat(rounding.round(time("2016-03-26T13:00:00+01:00")), isDate(time("2016-03-26T12:00:00+01:00"), tz)); - assertThat(rounding.round(time("2016-03-27T01:00:00+01:00")), isDate(time("2016-03-27T00:00:00+01:00"), tz)); - assertThat(rounding.round(time("2016-03-27T13:00:00+02:00")), isDate(time("2016-03-27T12:00:00+02:00"), tz)); - assertThat(rounding.round(time("2016-03-28T01:00:00+02:00")), isDate(time("2016-03-28T00:00:00+02:00"), tz)); - assertThat(rounding.round(time("2016-03-28T13:00:00+02:00")), isDate(time("2016-03-28T12:00:00+02:00"), tz)); - } - - /** - * randomized test on {@link TimeIntervalRounding} with random interval and time zone offsets - */ - public void testIntervalRoundingRandom() { - for (int i = 0; i < 1000; i++) { - TimeUnit unit = randomFrom(new TimeUnit[] { TimeUnit.MINUTES, TimeUnit.HOURS, TimeUnit.DAYS }); - long interval = unit.toMillis(randomIntBetween(1, 365)); - DateTimeZone tz = randomDateTimeZone(); - Rounding rounding = new Rounding.TimeIntervalRounding(interval, tz); - long mainDate = Math.abs(randomLong() % (2 * (long) 10e11)); // 1970-01-01T00:00:00Z - 2033-05-18T05:33:20.000+02:00 - if (randomBoolean()) { - mainDate = nastyDate(mainDate, tz, interval); - } - // check two intervals around date - long previousRoundedValue = Long.MIN_VALUE; - for (long date = mainDate - 2 * interval; date < mainDate + 2 * interval; date += interval / 2) { - try { - final long roundedDate = rounding.round(date); - final long nextRoundingValue = rounding.nextRoundingValue(roundedDate); - assertThat("Rounding should be idempotent", roundedDate, equalTo(rounding.round(roundedDate))); - assertThat("Rounded value smaller or equal than unrounded", roundedDate, lessThanOrEqualTo(date)); - assertThat( - "Values smaller than rounded value should round further down", - rounding.round(roundedDate - 1), - lessThan(roundedDate) - ); - assertThat("Rounding should be >= previous rounding value", roundedDate, greaterThanOrEqualTo(previousRoundedValue)); - - if (tz.isFixed()) { - assertThat("NextRounding value should be greater than date", nextRoundingValue, greaterThan(roundedDate)); - assertThat( - "NextRounding value should be interval from rounded value", - nextRoundingValue - roundedDate, - equalTo(interval) - ); - assertThat( - "NextRounding value should be a rounded date", - nextRoundingValue, - equalTo(rounding.round(nextRoundingValue)) - ); - } - previousRoundedValue = roundedDate; - } catch (AssertionError e) { - logger.error("Rounding error at {}, timezone {}, interval: {},", new DateTime(date, tz), tz, interval); - throw e; - } - } - } - } - - /** - * Test that rounded values are always greater or equal to last rounded value if date is increasing. 
- * The example covers an interval around 2011-10-30T02:10:00+01:00, time zone CET, interval: 2700000ms - */ - public void testIntervalRoundingMonotonic_CET() { - long interval = TimeUnit.MINUTES.toMillis(45); - DateTimeZone tz = DateTimeZone.forID("CET"); - Rounding rounding = new Rounding.TimeIntervalRounding(interval, tz); - List> expectedDates = new ArrayList<>(); - // first date is the date to be rounded, second the expected result - expectedDates.add(new Tuple<>("2011-10-30T01:40:00.000+02:00", "2011-10-30T01:30:00.000+02:00")); - expectedDates.add(new Tuple<>("2011-10-30T02:02:30.000+02:00", "2011-10-30T01:30:00.000+02:00")); - expectedDates.add(new Tuple<>("2011-10-30T02:25:00.000+02:00", "2011-10-30T02:15:00.000+02:00")); - expectedDates.add(new Tuple<>("2011-10-30T02:47:30.000+02:00", "2011-10-30T02:15:00.000+02:00")); - expectedDates.add(new Tuple<>("2011-10-30T02:10:00.000+01:00", "2011-10-30T02:15:00.000+02:00")); - expectedDates.add(new Tuple<>("2011-10-30T02:32:30.000+01:00", "2011-10-30T02:15:00.000+01:00")); - expectedDates.add(new Tuple<>("2011-10-30T02:55:00.000+01:00", "2011-10-30T02:15:00.000+01:00")); - expectedDates.add(new Tuple<>("2011-10-30T03:17:30.000+01:00", "2011-10-30T03:00:00.000+01:00")); - - long previousDate = Long.MIN_VALUE; - for (Tuple dates : expectedDates) { - final long roundedDate = rounding.round(time(dates.v1())); - assertThat(roundedDate, isDate(time(dates.v2()), tz)); - assertThat(roundedDate, greaterThanOrEqualTo(previousDate)); - previousDate = roundedDate; - } - // here's what this means for interval widths - assertEquals(TimeUnit.MINUTES.toMillis(45), time("2011-10-30T02:15:00.000+02:00") - time("2011-10-30T01:30:00.000+02:00")); - assertEquals(TimeUnit.MINUTES.toMillis(60), time("2011-10-30T02:15:00.000+01:00") - time("2011-10-30T02:15:00.000+02:00")); - assertEquals(TimeUnit.MINUTES.toMillis(45), time("2011-10-30T03:00:00.000+01:00") - time("2011-10-30T02:15:00.000+01:00")); - } - - /** - * special test for DST switch from #9491 - */ - public void testAmbiguousHoursAfterDSTSwitch() { - Rounding tzRounding; - final DateTimeZone tz = DateTimeZone.forID("Asia/Jerusalem"); - tzRounding = Rounding.builder(DateTimeUnit.HOUR_OF_DAY).timeZone(tz).build(); - assertThat(tzRounding.round(time("2014-10-26T00:30:00+03:00")), isDate(time("2014-10-26T00:00:00+03:00"), tz)); - assertThat(tzRounding.round(time("2014-10-26T01:30:00+03:00")), isDate(time("2014-10-26T01:00:00+03:00"), tz)); - // the utc date for "2014-10-25T03:00:00+03:00" and "2014-10-25T03:00:00+02:00" is the same, local time turns back 1h here - assertThat(time("2014-10-26T03:00:00+03:00"), isDate(time("2014-10-26T02:00:00+02:00"), tz)); - assertThat(tzRounding.round(time("2014-10-26T01:30:00+02:00")), isDate(time("2014-10-26T01:00:00+02:00"), tz)); - assertThat(tzRounding.round(time("2014-10-26T02:30:00+02:00")), isDate(time("2014-10-26T02:00:00+02:00"), tz)); - - // Day interval - tzRounding = Rounding.builder(DateTimeUnit.DAY_OF_MONTH).timeZone(tz).build(); - assertThat(tzRounding.round(time("2014-11-11T17:00:00", tz)), isDate(time("2014-11-11T00:00:00", tz), tz)); - // DST on - assertThat(tzRounding.round(time("2014-08-11T17:00:00", tz)), isDate(time("2014-08-11T00:00:00", tz), tz)); - // Day of switching DST on -> off - assertThat(tzRounding.round(time("2014-10-26T17:00:00", tz)), isDate(time("2014-10-26T00:00:00", tz), tz)); - // Day of switching DST off -> on - assertThat(tzRounding.round(time("2015-03-27T17:00:00", tz)), isDate(time("2015-03-27T00:00:00", tz), tz)); - - // Month 
interval - tzRounding = Rounding.builder(DateTimeUnit.MONTH_OF_YEAR).timeZone(tz).build(); - assertThat(tzRounding.round(time("2014-11-11T17:00:00", tz)), isDate(time("2014-11-01T00:00:00", tz), tz)); - // DST on - assertThat(tzRounding.round(time("2014-10-10T17:00:00", tz)), isDate(time("2014-10-01T00:00:00", tz), tz)); - - // Year interval - tzRounding = Rounding.builder(DateTimeUnit.YEAR_OF_CENTURY).timeZone(tz).build(); - assertThat(tzRounding.round(time("2014-11-11T17:00:00", tz)), isDate(time("2014-01-01T00:00:00", tz), tz)); - - // Two timestamps in same year and different timezone offset ("Double buckets" issue - #9491) - tzRounding = Rounding.builder(DateTimeUnit.YEAR_OF_CENTURY).timeZone(tz).build(); - assertThat(tzRounding.round(time("2014-11-11T17:00:00", tz)), isDate(tzRounding.round(time("2014-08-11T17:00:00", tz)), tz)); - } - - /** - * test for #10025, strict local to UTC conversion can cause joda exceptions - * on DST start - */ - public void testLenientConversionDST() { - DateTimeZone tz = DateTimeZone.forID("America/Sao_Paulo"); - long start = time("2014-10-18T20:50:00.000", tz); - long end = time("2014-10-19T01:00:00.000", tz); - Rounding tzRounding = new Rounding.TimeUnitRounding(DateTimeUnit.MINUTES_OF_HOUR, tz); - Rounding dayTzRounding = new Rounding.TimeIntervalRounding(60000, tz); - for (long time = start; time < end; time = time + 60000) { - assertThat(tzRounding.nextRoundingValue(time), greaterThan(time)); - assertThat(dayTzRounding.nextRoundingValue(time), greaterThan(time)); - } - } - - public void testEdgeCasesTransition() { - { - // standard +/-1 hour DST transition, CET - DateTimeUnit timeUnit = DateTimeUnit.HOUR_OF_DAY; - DateTimeZone tz = DateTimeZone.forID("CET"); - Rounding rounding = new Rounding.TimeUnitRounding(timeUnit, tz); - - // 29 Mar 2015 - Daylight Saving Time Started - // at 02:00:00 clocks were turned forward 1 hour to 03:00:00 - assertInterval(time("2015-03-29T00:00:00.000+01:00"), time("2015-03-29T01:00:00.000+01:00"), rounding, 60, tz); - assertInterval(time("2015-03-29T01:00:00.000+01:00"), time("2015-03-29T03:00:00.000+02:00"), rounding, 60, tz); - assertInterval(time("2015-03-29T03:00:00.000+02:00"), time("2015-03-29T04:00:00.000+02:00"), rounding, 60, tz); - - // 25 Oct 2015 - Daylight Saving Time Ended - // at 03:00:00 clocks were turned backward 1 hour to 02:00:00 - assertInterval(time("2015-10-25T01:00:00.000+02:00"), time("2015-10-25T02:00:00.000+02:00"), rounding, 60, tz); - assertInterval(time("2015-10-25T02:00:00.000+02:00"), time("2015-10-25T02:00:00.000+01:00"), rounding, 60, tz); - assertInterval(time("2015-10-25T02:00:00.000+01:00"), time("2015-10-25T03:00:00.000+01:00"), rounding, 60, tz); - } - - { - // time zone "Asia/Kathmandu" - // 1 Jan 1986 - Time Zone Change (IST → NPT), at 00:00:00 clocks were turned forward 00:15 minutes - // - // hour rounding is stable before 1985-12-31T23:00:00.000 and after 1986-01-01T01:00:00.000+05:45 - // the interval between is 105 minutes long because the hour after transition starts at 00:15 - // which is not a round value for hourly rounding - DateTimeUnit timeUnit = DateTimeUnit.HOUR_OF_DAY; - DateTimeZone tz = DateTimeZone.forID("Asia/Kathmandu"); - Rounding rounding = new Rounding.TimeUnitRounding(timeUnit, tz); - - assertInterval(time("1985-12-31T22:00:00.000+05:30"), time("1985-12-31T23:00:00.000+05:30"), rounding, 60, tz); - assertInterval(time("1985-12-31T23:00:00.000+05:30"), time("1986-01-01T01:00:00.000+05:45"), rounding, 105, tz); - 
assertInterval(time("1986-01-01T01:00:00.000+05:45"), time("1986-01-01T02:00:00.000+05:45"), rounding, 60, tz); - } - - { - // time zone "Australia/Lord_Howe" - // 3 Mar 1991 - Daylight Saving Time Ended - // at 02:00:00 clocks were turned backward 0:30 hours to Sunday, 3 March 1991, 01:30:00 - DateTimeUnit timeUnit = DateTimeUnit.HOUR_OF_DAY; - DateTimeZone tz = DateTimeZone.forID("Australia/Lord_Howe"); - Rounding rounding = new Rounding.TimeUnitRounding(timeUnit, tz); - - assertInterval(time("1991-03-03T00:00:00.000+11:00"), time("1991-03-03T01:00:00.000+11:00"), rounding, 60, tz); - assertInterval(time("1991-03-03T01:00:00.000+11:00"), time("1991-03-03T02:00:00.000+10:30"), rounding, 90, tz); - assertInterval(time("1991-03-03T02:00:00.000+10:30"), time("1991-03-03T03:00:00.000+10:30"), rounding, 60, tz); - - // 27 Oct 1991 - Daylight Saving Time Started - // at 02:00:00 clocks were turned forward 0:30 hours to 02:30:00 - assertInterval(time("1991-10-27T00:00:00.000+10:30"), time("1991-10-27T01:00:00.000+10:30"), rounding, 60, tz); - // the interval containing the switch time is 90 minutes long - assertInterval(time("1991-10-27T01:00:00.000+10:30"), time("1991-10-27T03:00:00.000+11:00"), rounding, 90, tz); - assertInterval(time("1991-10-27T03:00:00.000+11:00"), time("1991-10-27T04:00:00.000+11:00"), rounding, 60, tz); - } - - { - // time zone "Pacific/Chatham" - // 5 Apr 2015 - Daylight Saving Time Ended - // at 03:45:00 clocks were turned backward 1 hour to 02:45:00 - DateTimeUnit timeUnit = DateTimeUnit.HOUR_OF_DAY; - DateTimeZone tz = DateTimeZone.forID("Pacific/Chatham"); - Rounding rounding = new Rounding.TimeUnitRounding(timeUnit, tz); - - assertInterval(time("2015-04-05T02:00:00.000+13:45"), time("2015-04-05T03:00:00.000+13:45"), rounding, 60, tz); - assertInterval(time("2015-04-05T03:00:00.000+13:45"), time("2015-04-05T03:00:00.000+12:45"), rounding, 60, tz); - assertInterval(time("2015-04-05T03:00:00.000+12:45"), time("2015-04-05T04:00:00.000+12:45"), rounding, 60, tz); - - // 27 Sep 2015 - Daylight Saving Time Started - // at 02:45:00 clocks were turned forward 1 hour to 03:45:00 - - assertInterval(time("2015-09-27T01:00:00.000+12:45"), time("2015-09-27T02:00:00.000+12:45"), rounding, 60, tz); - assertInterval(time("2015-09-27T02:00:00.000+12:45"), time("2015-09-27T04:00:00.000+13:45"), rounding, 60, tz); - assertInterval(time("2015-09-27T04:00:00.000+13:45"), time("2015-09-27T05:00:00.000+13:45"), rounding, 60, tz); - } - } - - public void testDST_Europe_Rome() { - // time zone "Europe/Rome", rounding to days. Rome had two midnights on the day the clocks went back in 1978, and - // timeZone.convertLocalToUTC() gives the later of the two because Rome is east of UTC, whereas we want the earlier. 
- - DateTimeUnit timeUnit = DateTimeUnit.DAY_OF_MONTH; - DateTimeZone tz = DateTimeZone.forID("Europe/Rome"); - Rounding rounding = new TimeUnitRounding(timeUnit, tz); - - { - long timeBeforeFirstMidnight = time("1978-09-30T23:59:00+02:00"); - long floor = rounding.round(timeBeforeFirstMidnight); - assertThat(floor, isDate(time("1978-09-30T00:00:00+02:00"), tz)); - } - - { - long timeBetweenMidnights = time("1978-10-01T00:30:00+02:00"); - long floor = rounding.round(timeBetweenMidnights); - assertThat(floor, isDate(time("1978-10-01T00:00:00+02:00"), tz)); - } - - { - long timeAfterSecondMidnight = time("1978-10-01T00:30:00+01:00"); - long floor = rounding.round(timeAfterSecondMidnight); - assertThat(floor, isDate(time("1978-10-01T00:00:00+02:00"), tz)); - - long prevFloor = rounding.round(floor - 1); - assertThat(prevFloor, lessThan(floor)); - assertThat(prevFloor, isDate(time("1978-09-30T00:00:00+02:00"), tz)); - } - } - - /** - * Test for a time zone whose days overlap because the clocks are set back across midnight at the end of DST. - */ - public void testDST_America_St_Johns() { - // time zone "America/St_Johns", rounding to days. - DateTimeUnit timeUnit = DateTimeUnit.DAY_OF_MONTH; - DateTimeZone tz = DateTimeZone.forID("America/St_Johns"); - Rounding rounding = new TimeUnitRounding(timeUnit, tz); - - // 29 October 2006 - Daylight Saving Time ended, changing the UTC offset from -02:30 to -03:30. - // This happened at 02:31 UTC, 00:01 local time, so the clocks were set back 1 hour to 23:01 on the 28th. - // This means that 2006-10-29 has _two_ midnights, one in the -02:30 offset and one in the -03:30 offset. - // Only the first of these is considered "rounded". Moreover, the extra time between 23:01 and 23:59 - // should be considered as part of the 28th even though it comes after midnight on the 29th. - - { - // Times before the first midnight should be rounded up to the first midnight. - long timeBeforeFirstMidnight = time("2006-10-28T23:30:00.000-02:30"); - long floor = rounding.round(timeBeforeFirstMidnight); - assertThat(floor, isDate(time("2006-10-28T00:00:00.000-02:30"), tz)); - long ceiling = rounding.nextRoundingValue(timeBeforeFirstMidnight); - assertThat(ceiling, isDate(time("2006-10-29T00:00:00.000-02:30"), tz)); - assertInterval(floor, timeBeforeFirstMidnight, ceiling, rounding, tz); - } - - { - // Times between the two midnights which are on the later day should be rounded down to the later day's midnight. - long timeBetweenMidnights = time("2006-10-29T00:00:30.000-02:30"); - // (this is halfway through the last minute before the clocks changed, in which local time was ambiguous) - - long floor = rounding.round(timeBetweenMidnights); - assertThat(floor, isDate(time("2006-10-29T00:00:00.000-02:30"), tz)); - - long ceiling = rounding.nextRoundingValue(timeBetweenMidnights); - assertThat(ceiling, isDate(time("2006-10-30T00:00:00.000-03:30"), tz)); - - assertInterval(floor, timeBetweenMidnights, ceiling, rounding, tz); - } - - { - // Times between the two midnights which are on the earlier day should be rounded down to the earlier day's midnight. 
- long timeBetweenMidnights = time("2006-10-28T23:30:00.000-03:30"); - // (this is halfway through the hour after the clocks changed, in which local time was ambiguous) - - long floor = rounding.round(timeBetweenMidnights); - assertThat(floor, isDate(time("2006-10-28T00:00:00.000-02:30"), tz)); - - long ceiling = rounding.nextRoundingValue(timeBetweenMidnights); - assertThat(ceiling, isDate(time("2006-10-29T00:00:00.000-02:30"), tz)); - - assertInterval(floor, timeBetweenMidnights, ceiling, rounding, tz); - } - - { - // Times after the second midnight should be rounded down to the first midnight. - long timeAfterSecondMidnight = time("2006-10-29T06:00:00.000-03:30"); - long floor = rounding.round(timeAfterSecondMidnight); - assertThat(floor, isDate(time("2006-10-29T00:00:00.000-02:30"), tz)); - long ceiling = rounding.nextRoundingValue(timeAfterSecondMidnight); - assertThat(ceiling, isDate(time("2006-10-30T00:00:00.000-03:30"), tz)); - assertInterval(floor, timeAfterSecondMidnight, ceiling, rounding, tz); - } - } - - /** - * tests for dst transition with overlaps and day roundings. - */ - public void testDST_END_Edgecases() { - // First case, dst happens at 1am local time, switching back one hour. - // We want the overlapping hour to count for the next day, making it a 25h interval - - DateTimeUnit timeUnit = DateTimeUnit.DAY_OF_MONTH; - DateTimeZone tz = DateTimeZone.forID("Atlantic/Azores"); - Rounding rounding = new Rounding.TimeUnitRounding(timeUnit, tz); - - // Sunday, 29 October 2000, 01:00:00 clocks were turned backward 1 hour - // to Sunday, 29 October 2000, 00:00:00 local standard time instead - // which means there were two midnights that day. - - long midnightBeforeTransition = time("2000-10-29T00:00:00", tz); - long midnightOfTransition = time("2000-10-29T00:00:00-01:00"); - assertEquals(60L * 60L * 1000L, midnightOfTransition - midnightBeforeTransition); - long nextMidnight = time("2000-10-30T00:00:00", tz); - - assertInterval(midnightBeforeTransition, nextMidnight, rounding, 25 * 60, tz); - - assertThat(rounding.round(time("2000-10-29T06:00:00-01:00")), isDate(time("2000-10-29T00:00:00Z"), tz)); - - // Second case, dst happens at 0am local time, switching back one hour to 23pm local time. - // We want the overlapping hour to count for the previous day here - - tz = DateTimeZone.forID("America/Lima"); - rounding = new Rounding.TimeUnitRounding(timeUnit, tz); - - // Sunday, 1 April 1990, 00:00:00 clocks were turned backward 1 hour to - // Saturday, 31 March 1990, 23:00:00 local standard time instead - - midnightBeforeTransition = time("1990-03-31T00:00:00.000-04:00"); - nextMidnight = time("1990-04-01T00:00:00.000-05:00"); - assertInterval(midnightBeforeTransition, nextMidnight, rounding, 25 * 60, tz); - - // make sure the next interval is 24h long again - long midnightAfterTransition = time("1990-04-01T00:00:00.000-05:00"); - nextMidnight = time("1990-04-02T00:00:00.000-05:00"); - assertInterval(midnightAfterTransition, nextMidnight, rounding, 24 * 60, tz); - } - - /** - * Test that time zones are correctly parsed. There is a bug with - * Joda 2.9.4 (see https://github.com/JodaOrg/joda-time/issues/373) - */ - public void testsTimeZoneParsing() { - final DateTime expected = new DateTime(2016, 11, 10, 5, 37, 59, randomDateTimeZone()); - - // Formatter used to print and parse the sample date. 
- // Printing the date works but parsing it back fails - // with Joda 2.9.4 - DateTimeFormatter formatter = DateTimeFormat.forPattern("YYYY-MM-dd'T'HH:mm:ss " + randomFrom("ZZZ", "[ZZZ]", "'['ZZZ']'")); - - String dateTimeAsString = formatter.print(expected); - assertThat(dateTimeAsString, startsWith("2016-11-10T05:37:59 ")); - - DateTime parsedDateTime = formatter.parseDateTime(dateTimeAsString); - assertThat(parsedDateTime.getZone(), equalTo(expected.getZone())); - } - - private static void assertInterval(long rounded, long nextRoundingValue, Rounding rounding, int minutes, DateTimeZone tz) { - assertInterval(rounded, dateBetween(rounded, nextRoundingValue), nextRoundingValue, rounding, tz); - assertEquals(DateTimeConstants.MILLIS_PER_MINUTE * minutes, nextRoundingValue - rounded); - } - - /** - * perform a number on assertions and checks on {@link TimeUnitRounding} intervals - * @param rounded the expected low end of the rounding interval - * @param unrounded a date in the interval to be checked for rounding - * @param nextRoundingValue the expected upper end of the rounding interval - * @param rounding the rounding instance - */ - private static void assertInterval(long rounded, long unrounded, long nextRoundingValue, Rounding rounding, DateTimeZone tz) { - assertThat("rounding should be idempotent ", rounding.round(rounded), isDate(rounded, tz)); - assertThat("rounded value smaller or equal than unrounded" + rounding, rounded, lessThanOrEqualTo(unrounded)); - assertThat("values less than rounded should round further down" + rounding, rounding.round(rounded - 1), lessThan(rounded)); - assertThat("nextRounding value should be a rounded date", rounding.round(nextRoundingValue), isDate(nextRoundingValue, tz)); - assertThat( - "values above nextRounding should round down there", - rounding.round(nextRoundingValue + 1), - isDate(nextRoundingValue, tz) - ); - - if (isTimeWithWellDefinedRounding(tz, unrounded)) { - assertThat("nextRounding value should be greater than date" + rounding, nextRoundingValue, greaterThan(unrounded)); - - long dateBetween = dateBetween(rounded, nextRoundingValue); - assertThat( - "dateBetween [" + new DateTime(dateBetween, tz) + "] should round down to roundedDate", - rounding.round(dateBetween), - isDate(rounded, tz) - ); - assertThat( - "dateBetween [" + new DateTime(dateBetween, tz) + "] should round up to nextRoundingValue", - rounding.nextRoundingValue(dateBetween), - isDate(nextRoundingValue, tz) - ); - } - } - - private static boolean isTimeWithWellDefinedRounding(DateTimeZone tz, long t) { - if (tz.getID().equals("America/St_Johns") - || tz.getID().equals("America/Goose_Bay") - || tz.getID().equals("America/Moncton") - || tz.getID().equals("Canada/Newfoundland")) { - - // Clocks went back at 00:01 between 1987 and 2010, causing overlapping days. - // These timezones are otherwise uninteresting, so just skip this period. - - return t <= time("1987-10-01T00:00:00Z") || t >= time("2010-12-01T00:00:00Z"); - } - - if (tz.getID().equals("Antarctica/Casey")) { - - // Clocks went back 3 hours at 02:00 on 2010-03-05, causing overlapping days. 
- - return t <= time("2010-03-03T00:00:00Z") || t >= time("2010-03-07T00:00:00Z"); - } - - return true; - } - - private static long dateBetween(long lower, long upper) { - long dateBetween = randomLongBetween(lower, upper - 1); - assert lower <= dateBetween && dateBetween < upper; - return dateBetween; - } - - private static DateTimeUnit randomTimeUnit() { - byte id = (byte) randomIntBetween(1, 8); - return DateTimeUnit.resolve(id); - } - - private static long time(String time) { - return time(time, DateTimeZone.UTC); - } - - private static long time(String time, DateTimeZone zone) { - return ISODateTimeFormat.dateOptionalTimeParser().withZone(zone).parseMillis(time); - } - - private static Matcher isDate(final long expected, DateTimeZone tz) { - return new TypeSafeMatcher() { - @Override - public boolean matchesSafely(final Long item) { - return expected == item.longValue(); - } - - @Override - public void describeTo(Description description) { - description.appendText(new DateTime(expected, tz) + " [" + expected + "] "); - } - - @Override - protected void describeMismatchSafely(final Long actual, final Description mismatchDescription) { - mismatchDescription.appendText(" was ").appendValue(new DateTime(actual, tz) + " [" + actual + "]"); - } - }; - } -} From fcbec24408147f9b55b71458996c1cdb6a42dfbd Mon Sep 17 00:00:00 2001 From: Siddhant Deshmukh Date: Fri, 27 Oct 2023 12:50:35 -0700 Subject: [PATCH 24/33] Change log level to trace (#10971) Signed-off-by: Siddhant Deshmukh --- .../org/opensearch/action/search/SearchQueryCategorizer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/action/search/SearchQueryCategorizer.java b/server/src/main/java/org/opensearch/action/search/SearchQueryCategorizer.java index 9cbe2d2ffcb7d..8fe1be610f9af 100644 --- a/server/src/main/java/org/opensearch/action/search/SearchQueryCategorizer.java +++ b/server/src/main/java/org/opensearch/action/search/SearchQueryCategorizer.java @@ -75,7 +75,7 @@ private void logQueryShape(QueryBuilder topLevelQueryBuilder) { } QueryShapeVisitor shapeVisitor = new QueryShapeVisitor(); topLevelQueryBuilder.visit(shapeVisitor); - log.debug("Query shape : {}", shapeVisitor.prettyPrintTree(" ")); + log.trace("Query shape : {}", shapeVisitor.prettyPrintTree(" ")); } } From bc74731a537c461cd1d7666bc3be82fbe18e143d Mon Sep 17 00:00:00 2001 From: Neetika Singhal Date: Fri, 27 Oct 2023 16:41:26 -0700 Subject: [PATCH 25/33] update the indexRandom function to create more segments and update (#10247) IndicesRequestCacheIT, HighlighterSearcherIT to run with minimum of 2 slices Signed-off-by: Neetika Singhal --- CHANGELOG.md | 1 + .../indices/IndicesRequestCacheIT.java | 4 +- .../highlight/HighlighterSearchIT.java | 134 ++++++++++++++---- .../test/OpenSearchIntegTestCase.java | 51 +++++++ .../ParameterizedOpenSearchIntegTestCase.java | 8 ++ 5 files changed, 168 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 234b08398f9ef..34fd573b295b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Remote Store] Add repository stats for remote store([#10567](https://github.com/opensearch-project/OpenSearch/pull/10567)) - Add search query categorizer ([#10255](https://github.com/opensearch-project/OpenSearch/pull/10255)) - Introduce ConcurrentQueryProfiler to profile query using concurrent segment search path and support concurrency during rewrite and create weight 
([10352](https://github.com/opensearch-project/OpenSearch/pull/10352)) +- Update the indexRandom function to create more segments for concurrent search tests ([10247](https://github.com/opensearch-project/OpenSearch/pull/10247)) - [Remote cluster state] Make index and global metadata upload timeout dynamic cluster settings ([#10814](https://github.com/opensearch-project/OpenSearch/pull/10814)) - Added cluster setting cluster.restrict.index.replication_type to restrict setting of index setting replication type ([#10866](https://github.com/opensearch-project/OpenSearch/pull/10866)) - Add cluster state stats ([#10670](https://github.com/opensearch-project/OpenSearch/pull/10670)) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/IndicesRequestCacheIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/IndicesRequestCacheIT.java index 98a22717019cf..848f6eddbb0df 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/IndicesRequestCacheIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/IndicesRequestCacheIT.java @@ -539,7 +539,7 @@ public void testCanCache() throws Exception { assertCacheState(client, "index", 0, 4); } - public void testCacheWithFilteredAlias() { + public void testCacheWithFilteredAlias() throws InterruptedException { Client client = client(); Settings settings = Settings.builder() .put(IndicesRequestCache.INDEX_CACHE_REQUEST_ENABLED_SETTING.getKey(), true) @@ -562,6 +562,8 @@ public void testCacheWithFilteredAlias() { OpenSearchAssertions.assertAllSuccessful(forceMergeResponse); refresh(); + indexRandomForConcurrentSearch("index"); + assertCacheState(client, "index", 0, 0); SearchResponse r1 = client.prepareSearch("index") diff --git a/server/src/internalClusterTest/java/org/opensearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/server/src/internalClusterTest/java/org/opensearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 42d91ac945662..f7bc5eb75ad0f 100644 --- a/server/src/internalClusterTest/java/org/opensearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -155,7 +155,7 @@ protected Collection> nodePlugins() { return Arrays.asList(InternalSettingsPlugin.class, MockKeywordPlugin.class, MockAnalysisPlugin.class); } - public void testHighlightingWithKeywordIgnoreBoundaryScanner() throws IOException { + public void testHighlightingWithKeywordIgnoreBoundaryScanner() throws IOException, InterruptedException { XContentBuilder mappings = jsonBuilder(); mappings.startObject(); mappings.startObject("properties") @@ -177,6 +177,7 @@ public void testHighlightingWithKeywordIgnoreBoundaryScanner() throws IOExceptio .setSource(jsonBuilder().startObject().array("tags", "foo baz", "foo baz", "foo baz", "foo bar").field("sort", 2).endObject()) .get(); refresh(); + indexRandomForConcurrentSearch("test"); for (BoundaryScannerType scanner : BoundaryScannerType.values()) { SearchResponse search = client().prepareSearch() @@ -190,12 +191,13 @@ public void testHighlightingWithKeywordIgnoreBoundaryScanner() throws IOExceptio } } - public void testHighlightingWithStoredKeyword() throws IOException { + public void testHighlightingWithStoredKeyword() throws IOException, InterruptedException { XContentBuilder mappings = jsonBuilder(); mappings.startObject(); mappings.startObject("properties").startObject("text").field("type", "keyword").field("store", 
true).endObject().endObject(); mappings.endObject(); assertAcked(prepareCreate("test").setMapping(mappings)); + indexRandomForConcurrentSearch("test"); client().prepareIndex("test").setId("1").setSource(jsonBuilder().startObject().field("text", "foo").endObject()).get(); refresh(); SearchResponse search = client().prepareSearch() @@ -205,7 +207,7 @@ public void testHighlightingWithStoredKeyword() throws IOException { assertHighlight(search, 0, "text", 0, equalTo("foo")); } - public void testHighlightingWithWildcardName() throws IOException { + public void testHighlightingWithWildcardName() throws IOException, InterruptedException { // test the kibana case with * as fieldname that will try highlight all fields including meta fields XContentBuilder mappings = jsonBuilder(); mappings.startObject(); @@ -221,6 +223,7 @@ public void testHighlightingWithWildcardName() throws IOException { assertAcked(prepareCreate("test").setMapping(mappings)); client().prepareIndex("test").setId("1").setSource(jsonBuilder().startObject().field("text", "text").endObject()).get(); refresh(); + indexRandomForConcurrentSearch("test"); for (String type : ALL_TYPES) { SearchResponse search = client().prepareSearch() .setQuery(constantScoreQuery(matchQuery("text", "text"))) @@ -230,7 +233,7 @@ public void testHighlightingWithWildcardName() throws IOException { } } - public void testFieldAlias() throws IOException { + public void testFieldAlias() throws IOException, InterruptedException { XContentBuilder mappings = jsonBuilder().startObject() .startObject("properties") .startObject("text") @@ -248,7 +251,7 @@ public void testFieldAlias() throws IOException { client().prepareIndex("test").setId("1").setSource("text", "foo").get(); refresh(); - + indexRandomForConcurrentSearch("test"); for (String type : ALL_TYPES) { HighlightBuilder builder = new HighlightBuilder().field(new Field("alias").highlighterType(type)) .requireFieldMatch(randomBoolean()); @@ -257,7 +260,7 @@ public void testFieldAlias() throws IOException { } } - public void testFieldAliasWithSourceLookup() throws IOException { + public void testFieldAliasWithSourceLookup() throws IOException, InterruptedException { XContentBuilder mappings = jsonBuilder().startObject() .startObject("properties") .startObject("text") @@ -276,7 +279,7 @@ public void testFieldAliasWithSourceLookup() throws IOException { client().prepareIndex("test").setId("1").setSource("text", "foo bar").get(); refresh(); - + indexRandomForConcurrentSearch("test"); for (String type : ALL_TYPES) { HighlightBuilder builder = new HighlightBuilder().field(new Field("alias").highlighterType(type)) .requireFieldMatch(randomBoolean()); @@ -285,7 +288,7 @@ public void testFieldAliasWithSourceLookup() throws IOException { } } - public void testFieldAliasWithWildcardField() throws IOException { + public void testFieldAliasWithWildcardField() throws IOException, InterruptedException { XContentBuilder mappings = jsonBuilder().startObject() .startObject("properties") .startObject("keyword") @@ -301,13 +304,14 @@ public void testFieldAliasWithWildcardField() throws IOException { client().prepareIndex("test").setId("1").setSource("keyword", "foo").get(); refresh(); + indexRandomForConcurrentSearch("test"); HighlightBuilder builder = new HighlightBuilder().field(new Field("al*")).requireFieldMatch(false); SearchResponse search = client().prepareSearch().setQuery(matchQuery("alias", "foo")).highlighter(builder).get(); assertHighlight(search, 0, "alias", 0, equalTo("foo")); } - public void 
testHighlightingWhenFieldsAreNotStoredThereIsNoSource() throws IOException { + public void testHighlightingWhenFieldsAreNotStoredThereIsNoSource() throws IOException, InterruptedException { XContentBuilder mappings = jsonBuilder(); mappings.startObject(); mappings.startObject("_source") @@ -334,6 +338,7 @@ public void testHighlightingWhenFieldsAreNotStoredThereIsNoSource() throws IOExc .setSource(jsonBuilder().startObject().field("unstored_text", "text").field("text", "text").endObject()) .get(); refresh(); + indexRandomForConcurrentSearch("test"); for (String type : ALL_TYPES) { SearchResponse search = client().prepareSearch() .setQuery(constantScoreQuery(matchQuery("text", "text"))) @@ -350,7 +355,7 @@ public void testHighlightingWhenFieldsAreNotStoredThereIsNoSource() throws IOExc } // see #3486 - public void testHighTermFrequencyDoc() throws IOException { + public void testHighTermFrequencyDoc() throws IOException, InterruptedException { assertAcked(prepareCreate("test").setMapping("name", "type=text,term_vector=with_positions_offsets,store=" + randomBoolean())); StringBuilder builder = new StringBuilder(); for (int i = 0; i < 6000; i++) { @@ -358,6 +363,7 @@ public void testHighTermFrequencyDoc() throws IOException { } client().prepareIndex("test").setId("1").setSource("name", builder.toString()).get(); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse search = client().prepareSearch() .setQuery(constantScoreQuery(matchQuery("name", "abc"))) .highlighter(new HighlightBuilder().field("name")) @@ -385,6 +391,7 @@ public void testEnsureNoNegativeOffsets() throws Exception { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse search = client().prepareSearch() .setQuery(matchQuery("long_term", "thisisaverylongwordandmakessurethisfails foo highlighed")) .highlighter(new HighlightBuilder().field("long_term", 18, 1).highlighterType("fvh")) @@ -671,7 +678,7 @@ public void testHighlightIssue1994() throws Exception { assertHighlight(search, 0, "titleTV", 1, 2, equalTo("highlight other text")); } - public void testGlobalHighlightingSettingsOverriddenAtFieldLevel() { + public void testGlobalHighlightingSettingsOverriddenAtFieldLevel() throws InterruptedException { createIndex("test"); ensureGreen(); @@ -684,6 +691,7 @@ public void testGlobalHighlightingSettingsOverriddenAtFieldLevel() { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1 and field2 produces different tags"); SearchSourceBuilder source = searchSource().query(termQuery("field1", "test")) @@ -734,6 +742,7 @@ public void testHighlightingOnWildcardFields() throws Exception { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field*"); SearchSourceBuilder source = searchSource() @@ -783,6 +792,7 @@ public void testForceSourceWithSourceDisabled() throws Exception { .setSource("field1", "The quick brown fox jumps over the lazy dog", "field2", "second field content") .get(); refresh(); + indexRandomForConcurrentSearch("test"); // works using stored field SearchResponse searchResponse = client().prepareSearch("test") @@ -823,6 +833,7 @@ public void testPlainHighlighter() throws Exception { client().prepareIndex("test").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = 
searchSource().query(termQuery("field1", "test")) @@ -1025,6 +1036,7 @@ public void testFVHManyMatches() throws Exception { String value = new String(new char[1024 * 256 / pattern.length()]).replace("\0", pattern); client().prepareIndex("test").setSource("field1", value).get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1 with default phrase limit"); SearchSourceBuilder source = searchSource().query(termQuery("field1", "t")) @@ -1116,6 +1128,7 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception ); index("test", "type1", "3", "foo", "weird", "bar", "result"); refresh(); + indexRandomForConcurrentSearch("test"); Field fooField = new Field("foo").numOfFragments(1) .order("score") @@ -1408,6 +1421,7 @@ public void testMultiMapperVectorWithStore() throws Exception { ensureGreen(); client().prepareIndex("test").setId("1").setSource("title", "this is a test").get(); refresh(); + indexRandomForConcurrentSearch("test"); // simple search on body with standard analyzer with a simple field query SearchResponse search = client().prepareSearch() @@ -1453,6 +1467,7 @@ public void testMultiMapperVectorFromSource() throws Exception { client().prepareIndex("test").setId("1").setSource("title", "this is a test").get(); refresh(); + indexRandomForConcurrentSearch("test"); // simple search on body with standard analyzer with a simple field query SearchResponse search = client().prepareSearch() @@ -1498,6 +1513,7 @@ public void testMultiMapperNoVectorWithStore() throws Exception { ensureGreen(); client().prepareIndex("test").setId("1").setSource("title", "this is a test").get(); refresh(); + indexRandomForConcurrentSearch("test"); // simple search on body with standard analyzer with a simple field query SearchResponse search = client().prepareSearch() @@ -1542,6 +1558,7 @@ public void testMultiMapperNoVectorFromSource() throws Exception { ensureGreen(); client().prepareIndex("test").setId("1").setSource("title", "this is a test").get(); refresh(); + indexRandomForConcurrentSearch("test"); // simple search on body with standard analyzer with a simple field query SearchResponse search = client().prepareSearch() @@ -1571,6 +1588,7 @@ public void testFastVectorHighlighterShouldFailIfNoTermVectors() throws Exceptio .setSource("title", "This is a test for the enabling fast vector highlighter"); } indexRandom(true, indexRequestBuilders); + indexRandomForConcurrentSearch("test"); SearchResponse search = client().prepareSearch() .setQuery(matchPhraseQuery("title", "this is a test")) @@ -1608,6 +1626,7 @@ public void testDisableFastVectorHighlighter() throws Exception { .setSource("title", "This is a test for the workaround for the fast vector highlighting SOLR-3724"); } indexRandom(true, indexRequestBuilders); + indexRandomForConcurrentSearch("test"); SearchResponse search = client().prepareSearch() .setQuery(matchPhraseQuery("title", "test for the workaround")) @@ -1669,6 +1688,7 @@ public void testFSHHighlightAllMvFragments() throws Exception { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchQuery("tags", "tag")) @@ -1686,11 +1706,12 @@ public void testFSHHighlightAllMvFragments() throws Exception { ); } - public void testBoostingQuery() { + public void testBoostingQuery() throws InterruptedException { createIndex("test"); ensureGreen(); client().prepareIndex("test").setSource("field1", "this is a test", "field2", "The quick 
brown fox jumps over the lazy dog").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query( @@ -1702,11 +1723,12 @@ public void testBoostingQuery() { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog")); } - public void testBoostingQueryTermVector() throws IOException { + public void testBoostingQueryTermVector() throws IOException, InterruptedException { assertAcked(prepareCreate("test").setMapping(type1TermVectorMapping())); ensureGreen(); client().prepareIndex("test").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query( @@ -1718,12 +1740,13 @@ public void testBoostingQueryTermVector() throws IOException { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog")); } - public void testCommonTermsQuery() { + public void testCommonTermsQuery() throws InterruptedException { createIndex("test"); ensureGreen(); client().prepareIndex("test").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query(commonTermsQuery("field2", "quick brown").cutoffFrequency(100)) @@ -1733,12 +1756,13 @@ public void testCommonTermsQuery() { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog")); } - public void testCommonTermsTermVector() throws IOException { + public void testCommonTermsTermVector() throws IOException, InterruptedException { assertAcked(prepareCreate("test").setMapping(type1TermVectorMapping())); ensureGreen(); client().prepareIndex("test").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query(commonTermsQuery("field2", "quick brown").cutoffFrequency(100)) .highlighter(highlight().field("field2").order("score").preTags("").postTags("")); @@ -1764,6 +1788,7 @@ public void testPlainHighlightDifferentFragmenter() throws Exception { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchPhraseQuery("tags", "long tag")) @@ -1816,12 +1841,13 @@ public void testPlainHighlightDifferentFragmenter() throws Exception { ); } - public void testPlainHighlighterMultipleFields() { + public void testPlainHighlighterMultipleFields() throws InterruptedException { createIndex("test"); ensureGreen(); index("test", "type1", "1", "field1", "The quick brown fox", "field2", "The slow brown fox"); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchQuery("field1", "fox")) @@ -1834,7 +1860,7 @@ public void testPlainHighlighterMultipleFields() { assertHighlight(response, 0, "field2", 0, 1, equalTo("The slow brown <2>fox")); } - public void testFastVectorHighlighterMultipleFields() { + public void testFastVectorHighlighterMultipleFields() throws InterruptedException { 
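// (This patch adds indexRandomForConcurrentSearch("test") after refresh() throughout this suite; the
// helper, introduced later in the same change in OpenSearchIntegTestCase/ParameterizedOpenSearchIntegTestCase,
// indexes and refreshes batches of bogus documents so every shard ends up with several segments, letting
// the concurrent segment search path form more than one slice. Because the helper awaits the async index
// requests, each touched test now also declares throws InterruptedException.)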
assertAcked( prepareCreate("test").setMapping( "field1", @@ -1847,6 +1873,7 @@ public void testFastVectorHighlighterMultipleFields() { index("test", "type1", "1", "field1", "The quick brown fox", "field2", "The slow brown fox"); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchQuery("field1", "fox")) @@ -1864,6 +1891,7 @@ public void testMissingStoredField() throws Exception { ensureGreen(); client().prepareIndex("test").setId("1").setSource(jsonBuilder().startObject().field("field", "highlight").endObject()).get(); refresh(); + indexRandomForConcurrentSearch("test"); // This query used to fail when the field to highlight was absent SearchResponse response = client().prepareSearch("test") @@ -1904,6 +1932,7 @@ public void testNumericHighlighting() throws Exception { .setSource("text", "opensearch test", "byte", 25, "short", 42, "int", 100, "long", -1, "float", 3.2f, "double", 42.42) .get(); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchQuery("text", "test")) @@ -1926,6 +1955,7 @@ public void testResetTwice() throws Exception { ensureGreen(); client().prepareIndex("test").setId("1").setSource("text", "opensearch test").get(); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchQuery("text", "test")) @@ -1935,7 +1965,7 @@ public void testResetTwice() throws Exception { assertHitCount(response, 1L); } - public void testHighlightUsesHighlightQuery() throws IOException { + public void testHighlightUsesHighlightQuery() throws IOException, InterruptedException { assertAcked( prepareCreate("test").setMapping( "text", @@ -1946,6 +1976,7 @@ public void testHighlightUsesHighlightQuery() throws IOException { index("test", "type1", "1", "text", "Testing the highlight query feature"); refresh(); + indexRandomForConcurrentSearch("test"); for (String type : ALL_TYPES) { HighlightBuilder.Field field = new HighlightBuilder.Field("text"); @@ -1981,7 +2012,11 @@ private static String randomStoreField() { return ""; } - public void testHighlightNoMatchSize() throws IOException { + public void testHighlightNoMatchSize() throws IOException, InterruptedException { + assumeFalse( + "Concurrent search case muted pending fix: https://github.com/opensearch-project/OpenSearch/issues/10900", + internalCluster().clusterService().getClusterSettings().get(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING) + ); assertAcked( prepareCreate("test").setMapping( "text", @@ -1993,6 +2028,7 @@ public void testHighlightNoMatchSize() throws IOException { String text = "I am pretty long so some of me should get cut off. Second sentence"; index("test", "type1", "1", "text", text); refresh(); + indexRandomForConcurrentSearch("test"); // When you don't set noMatchSize you don't get any results if there isn't anything to highlight. 
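// (The remainder of this test, outside this hunk, flips that behaviour by setting noMatchSize on the
// field, e.g. field.noMatchSize(21): with a positive noMatchSize the highlighter returns the leading
// fragment of the field content even when the query did not match it, which the later
// assertHighlight / assertNotHighlighted calls in this test verify.)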
HighlightBuilder.Field field = new HighlightBuilder.Field("text").fragmentSize(21).numOfFragments(1).highlighterType("plain"); @@ -2091,7 +2127,11 @@ public void testHighlightNoMatchSize() throws IOException { assertNotHighlighted(response, 0, "text"); } - public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException { + public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException, InterruptedException { + assumeFalse( + "Concurrent search case muted pending fix: https://github.com/opensearch-project/OpenSearch/issues/10900", + internalCluster().clusterService().getClusterSettings().get(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING) + ); assertAcked( prepareCreate("test").setMapping( "text", @@ -2104,6 +2144,7 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException { String text2 = "I am short"; index("test", "type1", "1", "text", new String[] { text1, text2 }); refresh(); + indexRandomForConcurrentSearch("test"); // The no match fragment should come from the first value of a multi-valued field HighlightBuilder.Field field = new HighlightBuilder.Field("text").fragmentSize(21) @@ -2186,7 +2227,11 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException { assertNotHighlighted(response, 0, "text"); } - public void testHighlightNoMatchSizeNumberOfFragments() throws IOException { + public void testHighlightNoMatchSizeNumberOfFragments() throws IOException, InterruptedException { + assumeFalse( + "Concurrent search case muted pending fix: https://github.com/opensearch-project/OpenSearch/issues/10900", + internalCluster().clusterService().getClusterSettings().get(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING) + ); assertAcked( prepareCreate("test").setMapping( "text", @@ -2200,6 +2245,7 @@ public void testHighlightNoMatchSizeNumberOfFragments() throws IOException { String text3 = "This is the fifth sentence"; index("test", "type1", "1", "text", new String[] { text1, text2, text3 }); refresh(); + indexRandomForConcurrentSearch("test"); // The no match fragment should come from the first value of a multi-valued field HighlightBuilder.Field field = new HighlightBuilder.Field("text").fragmentSize(1) @@ -2243,6 +2289,7 @@ public void testPostingsHighlighter() throws Exception { .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy quick dog") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query(termQuery("field1", "test")) @@ -2320,6 +2367,7 @@ public void testPostingsHighlighterMultipleFields() throws Exception { "The slow brown fox. Second sentence." 
); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchQuery("field1", "fox")) @@ -2344,6 +2392,7 @@ public void testPostingsHighlighterNumberOfFragments() throws Exception { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query(termQuery("field1", "fox")) @@ -2376,6 +2425,7 @@ public void testPostingsHighlighterNumberOfFragments() throws Exception { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); source = searchSource().query(termQuery("field1", "fox")) .highlighter(highlight().field(new Field("field1").numOfFragments(0).preTags("").postTags(""))); @@ -2412,7 +2462,7 @@ public void testPostingsHighlighterNumberOfFragments() throws Exception { } } - public void testMultiMatchQueryHighlight() throws IOException { + public void testMultiMatchQueryHighlight() throws IOException, InterruptedException { XContentBuilder mapping = XContentFactory.jsonBuilder() .startObject() .startObject("properties") @@ -2434,6 +2484,7 @@ public void testMultiMatchQueryHighlight() throws IOException { .setSource("field1", "The quick brown fox jumps over", "field2", "The quick brown fox jumps over") .get(); refresh(); + indexRandomForConcurrentSearch("test"); final int iters = scaledRandomIntBetween(20, 30); for (int i = 0; i < iters; i++) { String highlighterType = rarely() ? null : RandomPicks.randomFrom(random(), ALL_TYPES); @@ -2479,6 +2530,7 @@ public void testPostingsHighlighterOrderByScore() throws Exception { ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query(termQuery("field1", "sentence")) @@ -2565,6 +2617,7 @@ public void testPostingsHighlighterMultiMapperWithStore() throws Exception { ensureGreen(); client().prepareIndex("test").setId("1").setSource("title", "this is a test . Second sentence.").get(); refresh(); + indexRandomForConcurrentSearch("test"); // simple search on body with standard analyzer with a simple field query SearchResponse searchResponse = client().prepareSearch() @@ -2623,6 +2676,7 @@ public void testPostingsHighlighterMultiMapperFromSource() throws Exception { client().prepareIndex("test").setId("1").setSource("title", "this is a test").get(); refresh(); + indexRandomForConcurrentSearch("test"); // simple search on body with standard analyzer with a simple field query SearchResponse searchResponse = client().prepareSearch() @@ -2672,13 +2726,14 @@ public void testPostingsHighlighterShouldFailIfNoOffsets() throws Exception { assertNoFailures(search); } - public void testPostingsHighlighterBoostingQuery() throws IOException { + public void testPostingsHighlighterBoostingQuery() throws IOException, InterruptedException { assertAcked(prepareCreate("test").setMapping(type1PostingsffsetsMapping())); ensureGreen(); client().prepareIndex("test") .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! Second sentence.") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query( @@ -2689,7 +2744,7 @@ public void testPostingsHighlighterBoostingQuery() throws IOException { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog! 
Second sentence.")); } - public void testPostingsHighlighterCommonTermsQuery() throws IOException { + public void testPostingsHighlighterCommonTermsQuery() throws IOException, InterruptedException { assertAcked(prepareCreate("test").setMapping(type1PostingsffsetsMapping())); ensureGreen(); @@ -2697,6 +2752,7 @@ public void testPostingsHighlighterCommonTermsQuery() throws IOException { .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! Second sentence.") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query(commonTermsQuery("field2", "quick brown").cutoffFrequency(100)) @@ -2738,6 +2794,7 @@ public void testPostingsHighlighterPrefixQuery() throws Exception { .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! Second sentence.") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field2"); SearchSourceBuilder source = searchSource().query(prefixQuery("field2", "qui")).highlighter(highlight().field("field2")); @@ -2760,6 +2817,7 @@ public void testPostingsHighlighterFuzzyQuery() throws Exception { .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! Second sentence.") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field2"); SearchSourceBuilder source = searchSource().query(fuzzyQuery("field2", "quck")).highlighter(highlight().field("field2")); @@ -2783,6 +2841,7 @@ public void testPostingsHighlighterRegexpQuery() throws Exception { .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! Second sentence.") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field2"); SearchSourceBuilder source = searchSource().query(regexpQuery("field2", "qu[a-l]+k")).highlighter(highlight().field("field2")); @@ -2806,6 +2865,7 @@ public void testPostingsHighlighterWildcardQuery() throws Exception { .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! Second sentence.") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field2"); SearchSourceBuilder source = searchSource().query(wildcardQuery("field2", "qui*")).highlighter(highlight().field("field2")); @@ -2840,6 +2900,7 @@ public void testPostingsHighlighterTermRangeQuery() throws Exception { client().prepareIndex("test").setSource("field1", "this is a test", "field2", "aaab").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field2"); SearchSourceBuilder source = searchSource().query(rangeQuery("field2").gte("aaaa").lt("zzzz")) @@ -2857,6 +2918,7 @@ public void testPostingsHighlighterQueryString() throws Exception { .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! 
Second sentence.") .get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field2"); SearchSourceBuilder source = searchSource().query(queryStringQuery("qui*").defaultField("field2")) @@ -2878,6 +2940,7 @@ public void testPostingsHighlighterRegexpQueryWithinConstantScoreQuery() throws client().prepareIndex("test").setSource("field1", "The photography word will get highlighted").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query(constantScoreQuery(regexpQuery("field1", "pho[a-z]+"))) @@ -2892,6 +2955,7 @@ public void testPostingsHighlighterMultiTermQueryMultipleLevels() throws Excepti client().prepareIndex("test").setSource("field1", "The photography word will get highlighted").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query( @@ -2909,6 +2973,7 @@ public void testPostingsHighlighterPrefixQueryWithinBooleanQuery() throws Except client().prepareIndex("test").setSource("field1", "The photography word will get highlighted").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query( @@ -2924,6 +2989,7 @@ public void testPostingsHighlighterQueryStringWithinFilteredQuery() throws Excep client().prepareIndex("test").setSource("field1", "The photography word will get highlighted").get(); refresh(); + indexRandomForConcurrentSearch("test"); logger.info("--> highlighting and searching on field1"); SearchSourceBuilder source = searchSource().query( @@ -3028,7 +3094,7 @@ public void testFastVectorHighlighterPhraseBoost() throws Exception { * because it doesn't support the concept of terms having a different weight based on position. * @param highlighterType highlighter to test */ - private void phraseBoostTestCase(String highlighterType) { + private void phraseBoostTestCase(String highlighterType) throws InterruptedException { ensureGreen(); StringBuilder text = new StringBuilder(); text.append("words words junk junk junk junk junk junk junk junk highlight junk junk junk junk together junk\n"); @@ -3041,6 +3107,7 @@ private void phraseBoostTestCase(String highlighterType) { } index("test", "type1", "1", "field1", text.toString()); refresh(); + indexRandomForConcurrentSearch("test"); // Match queries phraseBoostTestCaseForClauses( @@ -3109,7 +3176,7 @@ private
<P extends AbstractQueryBuilder<P>
> void phraseBoostTestCaseForClauses( assertHighlight(response, 0, "field1", 0, 1, highlightedMatcher); } - public void testGeoFieldHighlightingWithDifferentHighlighters() throws IOException { + public void testGeoFieldHighlightingWithDifferentHighlighters() throws IOException, InterruptedException { // check that we do not get an exception for geo_point fields in case someone tries to highlight // it accidentially with a wildcard // see https://github.com/elastic/elasticsearch/issues/17537 @@ -3133,6 +3200,7 @@ public void testGeoFieldHighlightingWithDifferentHighlighters() throws IOExcepti .setSource(jsonBuilder().startObject().field("text", "Arbitrary text field which will should not cause a failure").endObject()) .get(); refresh(); + indexRandomForConcurrentSearch("test"); String highlighterType = randomFrom(ALL_TYPES); QueryBuilder query = QueryBuilders.boolQuery() .should( @@ -3150,7 +3218,7 @@ public void testGeoFieldHighlightingWithDifferentHighlighters() throws IOExcepti assertThat(search.getHits().getAt(0).getHighlightFields().get("text").fragments().length, equalTo(1)); } - public void testGeoFieldHighlightingWhenQueryGetsRewritten() throws IOException { + public void testGeoFieldHighlightingWhenQueryGetsRewritten() throws IOException, InterruptedException { // same as above but in this example the query gets rewritten during highlighting // see https://github.com/elastic/elasticsearch/issues/17537#issuecomment-244939633 XContentBuilder mappings = jsonBuilder(); @@ -3177,6 +3245,7 @@ public void testGeoFieldHighlightingWhenQueryGetsRewritten() throws IOException ) .get(); refresh(); + indexRandomForConcurrentSearch("test"); QueryBuilder query = QueryBuilders.functionScoreQuery( QueryBuilders.boolQuery() @@ -3192,7 +3261,7 @@ public void testGeoFieldHighlightingWhenQueryGetsRewritten() throws IOException assertThat(search.getHits().getTotalHits().value, equalTo(1L)); } - public void testKeywordFieldHighlighting() throws IOException { + public void testKeywordFieldHighlighting() throws IOException, InterruptedException { // check that keyword highlighting works XContentBuilder mappings = jsonBuilder(); mappings.startObject(); @@ -3205,6 +3274,7 @@ public void testKeywordFieldHighlighting() throws IOException { .setSource(jsonBuilder().startObject().field("keyword_field", "some text").endObject()) .get(); refresh(); + indexRandomForConcurrentSearch("test"); SearchResponse search = client().prepareSearch() .setSource( new SearchSourceBuilder().query(QueryBuilders.matchQuery("keyword_field", "some text")) @@ -3238,6 +3308,7 @@ public void testCopyToFields() throws Exception { .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) .get(); + indexRandomForConcurrentSearch("test"); SearchResponse response = client().prepareSearch() .setQuery(matchQuery("foo_copy", "brown")) .highlighter(new HighlightBuilder().field(new Field("foo_copy"))) @@ -3287,7 +3358,7 @@ public void testACopyFieldWithNestedQuery() throws Exception { ) .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) .get(); - + indexRandomForConcurrentSearch("test"); SearchResponse searchResponse = client().prepareSearch() .setQuery(nestedQuery("foo", matchQuery("foo.text", "brown cow"), ScoreMode.None)) .highlighter(new HighlightBuilder().field(new Field("foo_text").highlighterType("fvh")).requireFieldMatch(false)) @@ -3305,6 +3376,7 @@ public void testFunctionScoreQueryHighlight() throws Exception { .setSource(jsonBuilder().startObject().field("text", "brown").endObject()) 
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) .get(); + indexRandomForConcurrentSearch("test"); SearchResponse searchResponse = client().prepareSearch() .setQuery(new FunctionScoreQueryBuilder(QueryBuilders.prefixQuery("text", "bro"))) @@ -3322,6 +3394,7 @@ public void testFiltersFunctionScoreQueryHighlight() throws Exception { .setSource(jsonBuilder().startObject().field("text", "brown").field("enable", "yes").endObject()) .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) .get(); + indexRandomForConcurrentSearch("test"); FunctionScoreQueryBuilder.FilterFunctionBuilder filterBuilder = new FunctionScoreQueryBuilder.FilterFunctionBuilder( QueryBuilders.termQuery("enable", "yes"), new RandomScoreFunctionBuilder() @@ -3420,6 +3493,7 @@ public void testWithNestedQuery() throws Exception { ) .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) .get(); + indexRandomForConcurrentSearch("test"); for (String type : new String[] { "unified", "plain" }) { SearchResponse searchResponse = client().prepareSearch() @@ -3477,6 +3551,7 @@ public void testWithNormalizer() throws Exception { .setSource("keyword", "Hello World") .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) .get(); + indexRandomForConcurrentSearch("test"); for (String highlighterType : new String[] { "unified", "plain" }) { SearchResponse searchResponse = client().prepareSearch() @@ -3499,6 +3574,7 @@ public void testDisableHighlightIdField() throws Exception { .setSource("keyword", "Hello World") .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) .get(); + indexRandomForConcurrentSearch("test"); for (String highlighterType : new String[] { "plain", "unified" }) { SearchResponse searchResponse = client().prepareSearch() diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java index ad27d9834f159..0c6c81103922f 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java @@ -1666,6 +1666,11 @@ public void indexRandom(boolean forceRefresh, boolean dummyDocuments, boolean ma } } assertThat(actualErrors, emptyIterable()); + + if (dummyDocuments) { + bogusIds.addAll(indexRandomForMultipleSlices(indicesArray)); + } + if (!bogusIds.isEmpty()) { // delete the bogus types again - it might trigger merges or at least holes in the segments and enforces deleted docs! for (List doc : bogusIds) { @@ -1683,6 +1688,52 @@ public void indexRandom(boolean forceRefresh, boolean dummyDocuments, boolean ma } } + /* + * This method ingests bogus documents for the given indices such that multiple slices + * are formed. This is useful for testing with the concurrent search use-case as it creates + * multiple slices based on segment count. + * @param indices the indices in which bogus documents should be ingested + * */ + protected Set> indexRandomForMultipleSlices(String... 
indices) throws InterruptedException { + Set> bogusIds = new HashSet<>(); + int refreshCount = randomIntBetween(2, 3); + for (String index : indices) { + int numDocs = getNumShards(index).totalNumShards * randomIntBetween(2, 10); + while (refreshCount-- > 0) { + final CopyOnWriteArrayList> errors = new CopyOnWriteArrayList<>(); + List inFlightAsyncOperations = new ArrayList<>(); + for (int i = 0; i < numDocs; i++) { + String id = "bogus_doc_" + randomRealisticUnicodeOfLength(between(1, 10)) + dummmyDocIdGenerator.incrementAndGet(); + IndexRequestBuilder indexRequestBuilder = client().prepareIndex() + .setIndex(index) + .setId(id) + .setSource("{}", MediaTypeRegistry.JSON) + .setRouting(id); + indexRequestBuilder.execute( + new PayloadLatchedActionListener<>(indexRequestBuilder, newLatch(inFlightAsyncOperations), errors) + ); + bogusIds.add(Arrays.asList(index, id)); + } + for (CountDownLatch operation : inFlightAsyncOperations) { + operation.await(); + } + final List actualErrors = new ArrayList<>(); + for (Tuple tuple : errors) { + Throwable t = ExceptionsHelper.unwrapCause(tuple.v2()); + if (t instanceof OpenSearchRejectedExecutionException) { + logger.debug("Error indexing doc: " + t.getMessage() + ", reindexing."); + tuple.v1().execute().actionGet(); // re-index if rejected + } else { + actualErrors.add(tuple.v2()); + } + } + assertThat(actualErrors, emptyIterable()); + refresh(index); + } + } + return bogusIds; + } + private final AtomicInteger dummmyDocIdGenerator = new AtomicInteger(); /** Disables an index block for the specified index */ diff --git a/test/framework/src/main/java/org/opensearch/test/ParameterizedOpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/ParameterizedOpenSearchIntegTestCase.java index 636064d8e4f9d..f8813a8c5afa9 100644 --- a/test/framework/src/main/java/org/opensearch/test/ParameterizedOpenSearchIntegTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/ParameterizedOpenSearchIntegTestCase.java @@ -13,6 +13,8 @@ import org.junit.After; import org.junit.Before; +import static org.opensearch.search.SearchService.CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING; + /** * Base class for running the tests with parameterization of the dynamic settings * For any class that wants to use parameterization, use @ParametersFactory to generate @@ -44,4 +46,10 @@ public void afterTests() { dynamicSettings.keySet().forEach(settingsToUnset::putNull); client().admin().cluster().prepareUpdateSettings().setPersistentSettings(settingsToUnset).get(); } + + public void indexRandomForConcurrentSearch(String... 
indices) throws InterruptedException { + if (dynamicSettings.get(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING.getKey()).equals("true")) { + indexRandomForMultipleSlices(indices); + } + } } From 45f7be1c603d13a1ce48e5da449ae4610ba04e52 Mon Sep 17 00:00:00 2001 From: Gaurav Bafna <85113518+gbbafna@users.noreply.github.com> Date: Sat, 28 Oct 2023 15:43:14 +0530 Subject: [PATCH 26/33] =?UTF-8?q?Ensure=20that=20segments=20are=20upload?= =?UTF-8?q?=20to=20remote=20store=20in=20case=20of=20local=20and=20?= =?UTF-8?q?=E2=80=A6=20(#10948)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Signed-off-by: Gaurav Bafna --- .../indices/create/RemoteCloneIndexIT.java | 133 +++++ .../indices/create/RemoteShrinkIndexIT.java | 545 ++++++++++++++++++ .../indices/create/RemoteSplitIndexIT.java | 506 ++++++++++++++++ .../remotestore/RemoteRestoreSnapshotIT.java | 43 +- .../RemoteStoreBaseIntegTestCase.java | 2 +- .../opensearch/index/shard/IndexShard.java | 24 + .../shard/RemoteStoreRefreshListener.java | 22 +- .../opensearch/index/shard/StoreRecovery.java | 21 + .../index/shard/IndexShardTests.java | 1 + 9 files changed, 1278 insertions(+), 19 deletions(-) create mode 100644 server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java create mode 100644 server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteShrinkIndexIT.java create mode 100644 server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteSplitIndexIT.java diff --git a/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java new file mode 100644 index 0000000000000..a081110e6c5a1 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java @@ -0,0 +1,133 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.admin.indices.create; + +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. 
+ */ + +import org.opensearch.Version; +import org.opensearch.action.admin.indices.settings.get.GetSettingsResponse; +import org.opensearch.action.admin.indices.shrink.ResizeType; +import org.opensearch.action.admin.indices.stats.IndicesStatsResponse; +import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.xcontent.MediaTypeRegistry; +import org.opensearch.index.query.TermsQueryBuilder; +import org.opensearch.remotestore.RemoteStoreBaseIntegTestCase; +import org.opensearch.test.VersionUtils; + +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertHitCount; +import static org.hamcrest.Matchers.equalTo; + +public class RemoteCloneIndexIT extends RemoteStoreBaseIntegTestCase { + + @Override + protected boolean forbidPrivateIndexSettings() { + return false; + } + + public void testCreateCloneIndex() { + Version version = VersionUtils.randomIndexCompatibleVersion(random()); + int numPrimaryShards = randomIntBetween(1, 5); + prepareCreate("source").setSettings( + Settings.builder().put(indexSettings()).put("number_of_shards", numPrimaryShards).put("index.version.created", version) + ).get(); + final int docs = randomIntBetween(0, 128); + for (int i = 0; i < docs; i++) { + client().prepareIndex("source").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + internalCluster().ensureAtLeastNumDataNodes(2); + // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node + // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due + // to the require._name below. + ensureGreen(); + // relocate all shards to one node such that we can merge it. + client().admin().indices().prepareUpdateSettings("source").setSettings(Settings.builder().put("index.blocks.write", true)).get(); + ensureGreen(); + + final IndicesStatsResponse sourceStats = client().admin().indices().prepareStats("source").setSegments(true).get(); + + // disable rebalancing to be able to capture the right stats. balancing can move the target primary + // making it hard to pin point the source shards. + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none")) + .get(); + try { + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setResizeType(ResizeType.CLONE) + .setSettings(Settings.builder().put("index.number_of_replicas", 0).putNull("index.blocks.write").build()) + .get() + ); + ensureGreen(); + + final IndicesStatsResponse targetStats = client().admin().indices().prepareStats("target").get(); + assertThat(targetStats.getIndex("target").getIndexShards().keySet().size(), equalTo(numPrimaryShards)); + + final int size = docs > 0 ? 
2 * docs : 1; + assertHitCount(client().prepareSearch("target").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs); + + for (int i = docs; i < 2 * docs; i++) { + client().prepareIndex("target").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + flushAndRefresh(); + assertHitCount( + client().prepareSearch("target").setSize(2 * size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), + 2 * docs + ); + assertHitCount(client().prepareSearch("source").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs); + GetSettingsResponse target = client().admin().indices().prepareGetSettings("target").get(); + assertEquals(version, target.getIndexToSettings().get("target").getAsVersion("index.version.created", null)); + } finally { + // clean up + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings( + Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), (String) null) + ) + .get(); + } + + } + +} diff --git a/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteShrinkIndexIT.java b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteShrinkIndexIT.java new file mode 100644 index 0000000000000..282eb9c6ad95e --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteShrinkIndexIT.java @@ -0,0 +1,545 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.admin.indices.create; + +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.search.SortedSetSortField; +import org.apache.lucene.util.Constants; +import org.opensearch.Version; +import org.opensearch.action.admin.cluster.reroute.ClusterRerouteResponse; +import org.opensearch.action.admin.cluster.state.ClusterStateRequest; +import org.opensearch.action.admin.cluster.state.ClusterStateResponse; +import org.opensearch.action.admin.indices.settings.get.GetSettingsResponse; +import org.opensearch.action.admin.indices.stats.CommonStats; +import org.opensearch.action.admin.indices.stats.IndicesStatsResponse; +import org.opensearch.action.admin.indices.stats.ShardStats; +import org.opensearch.action.index.IndexRequest; +import org.opensearch.action.support.ActiveShardCount; +import org.opensearch.client.Client; +import org.opensearch.cluster.ClusterInfoService; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.InternalClusterInfoService; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.routing.Murmur3HashFunction; +import org.opensearch.cluster.routing.RoutingTable; +import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.UnassignedInfo; +import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider; +import org.opensearch.common.Priority; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.index.Index; +import org.opensearch.core.xcontent.MediaTypeRegistry; +import org.opensearch.index.IndexModule; +import org.opensearch.index.IndexService; +import 
org.opensearch.index.engine.SegmentsStats; +import org.opensearch.index.query.TermsQueryBuilder; +import org.opensearch.index.seqno.SeqNoStats; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.indices.IndicesService; +import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.remotestore.RemoteStoreBaseIntegTestCase; +import org.opensearch.test.VersionUtils; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.IntStream; + +import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertHitCount; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; + +public class RemoteShrinkIndexIT extends RemoteStoreBaseIntegTestCase { + @Override + protected boolean forbidPrivateIndexSettings() { + return false; + } + + public Settings indexSettings() { + return Settings.builder() + .put(super.indexSettings()) + .put(IndexModule.INDEX_QUERY_CACHE_ENABLED_SETTING.getKey(), false) + .put(SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT) + .build(); + } + + public void testCreateShrinkIndexToN() { + + assumeFalse("https://github.com/elastic/elasticsearch/issues/34080", Constants.WINDOWS); + + int[][] possibleShardSplits = new int[][] { { 8, 4, 2 }, { 9, 3, 1 }, { 4, 2, 1 }, { 15, 5, 1 } }; + int[] shardSplits = randomFrom(possibleShardSplits); + assertEquals(shardSplits[0], (shardSplits[0] / shardSplits[1]) * shardSplits[1]); + assertEquals(shardSplits[1], (shardSplits[1] / shardSplits[2]) * shardSplits[2]); + internalCluster().ensureAtLeastNumDataNodes(2); + prepareCreate("source").setSettings(Settings.builder().put(indexSettings()).put("number_of_shards", shardSplits[0])).get(); + for (int i = 0; i < 20; i++) { + client().prepareIndex("source") + .setId(Integer.toString(i)) + .setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON) + .get(); + } + final Map dataNodes = client().admin().cluster().prepareState().get().getState().nodes().getDataNodes(); + assertTrue("at least 2 nodes but was: " + dataNodes.size(), dataNodes.size() >= 2); + DiscoveryNode[] discoveryNodes = dataNodes.values().toArray(new DiscoveryNode[0]); + String mergeNode = discoveryNodes[0].getName(); + // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node + // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due + // to the require._name below. + ensureGreen(); + // relocate all shards to one node such that we can merge it. 
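// (A shrink can only proceed when a copy of every shard of the source index sits on a single node and
// the index is write-blocked; the next statement pins the shards to the chosen merge node via
// index.routing.allocation.require._name and sets index.blocks.write before prepareResizeIndex is
// called further down.)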
+ client().admin() + .indices() + .prepareUpdateSettings("source") + .setSettings(Settings.builder().put("index.routing.allocation.require._name", mergeNode).put("index.blocks.write", true)) + .get(); + ensureGreen(); + // now merge source into a 4 shard index + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("source", "first_shrink") + .setSettings( + Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", shardSplits[1]) + .putNull("index.blocks.write") + .build() + ) + .get() + ); + ensureGreen(); + assertHitCount(client().prepareSearch("first_shrink").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + + for (int i = 0; i < 20; i++) { // now update + client().prepareIndex("first_shrink") + .setId(Integer.toString(i)) + .setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON) + .get(); + } + flushAndRefresh(); + assertHitCount(client().prepareSearch("first_shrink").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + assertHitCount(client().prepareSearch("source").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + + // relocate all shards to one node such that we can merge it. + client().admin() + .indices() + .prepareUpdateSettings("first_shrink") + .setSettings(Settings.builder().put("index.routing.allocation.require._name", mergeNode).put("index.blocks.write", true)) + .get(); + ensureGreen(); + // now merge source into a 2 shard index + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("first_shrink", "second_shrink") + .setSettings( + Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", shardSplits[2]) + .putNull("index.blocks.write") + .putNull("index.routing.allocation.require._name") + .build() + ) + .get() + ); + ensureGreen(); + assertHitCount(client().prepareSearch("second_shrink").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + // let it be allocated anywhere and bump replicas + client().admin() + .indices() + .prepareUpdateSettings("second_shrink") + .setSettings(Settings.builder().putNull("index.routing.allocation.include._id").put("index.number_of_replicas", 0)) + .get(); + ensureGreen(); + assertHitCount(client().prepareSearch("second_shrink").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + + for (int i = 0; i < 20; i++) { // now update + client().prepareIndex("second_shrink") + .setId(Integer.toString(i)) + .setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON) + .get(); + } + flushAndRefresh(); + assertHitCount(client().prepareSearch("second_shrink").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + assertHitCount(client().prepareSearch("first_shrink").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + assertHitCount(client().prepareSearch("source").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + } + + public void testShrinkIndexPrimaryTerm() throws Exception { + int numberOfShards = randomIntBetween(2, 20); + int numberOfTargetShards = randomValueOtherThanMany(n -> numberOfShards % n != 0, () -> randomIntBetween(1, numberOfShards - 1)); + internalCluster().ensureAtLeastNumDataNodes(2); + prepareCreate("source").setSettings(Settings.builder().put(indexSettings()).put("number_of_shards", numberOfShards)).get(); + + final Map dataNodes = client().admin().cluster().prepareState().get().getState().nodes().getDataNodes(); + 
assertThat(dataNodes.size(), greaterThanOrEqualTo(2)); + final DiscoveryNode[] discoveryNodes = dataNodes.values().toArray(new DiscoveryNode[0]); + final String mergeNode = discoveryNodes[0].getName(); + // This needs more than the default timeout if a large number of shards were created. + ensureGreen(TimeValue.timeValueSeconds(120)); + + // fail random primary shards to force primary terms to increase + final Index source = resolveIndex("source"); + final int iterations = scaledRandomIntBetween(0, 16); + for (int i = 0; i < iterations; i++) { + final String node = randomSubsetOf(1, internalCluster().nodesInclude("source")).get(0); + final IndicesService indexServices = internalCluster().getInstance(IndicesService.class, node); + final IndexService indexShards = indexServices.indexServiceSafe(source); + for (final Integer shardId : indexShards.shardIds()) { + final IndexShard shard = indexShards.getShard(shardId); + if (shard.routingEntry().primary() && randomBoolean()) { + disableAllocation("source"); + shard.failShard("test", new Exception("test")); + // this can not succeed until the shard is failed and a replica is promoted + int id = 0; + while (true) { + // find an ID that routes to the right shard, we will only index to the shard that saw a primary failure + final String s = Integer.toString(id); + final int hash = Math.floorMod(Murmur3HashFunction.hash(s), numberOfShards); + if (hash == shardId) { + final IndexRequest request = new IndexRequest("source").id(s) + .source("{ \"f\": \"" + s + "\"}", MediaTypeRegistry.JSON); + client().index(request).get(); + break; + } else { + id++; + } + } + enableAllocation("source"); + ensureGreen(); + } + } + } + + // relocate all shards to one node such that we can merge it. + final Settings.Builder prepareShrinkSettings = Settings.builder() + .put("index.routing.allocation.require._name", mergeNode) + .put("index.blocks.write", true); + client().admin().indices().prepareUpdateSettings("source").setSettings(prepareShrinkSettings).get(); + ensureGreen(TimeValue.timeValueSeconds(120)); // needs more than the default to relocate many shards + + final IndexMetadata indexMetadata = indexMetadata(client(), "source"); + final long beforeShrinkPrimaryTerm = IntStream.range(0, numberOfShards).mapToLong(indexMetadata::primaryTerm).max().getAsLong(); + + // now merge source into target + final Settings shrinkSettings = Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", numberOfTargetShards) + .build(); + assertAcked(client().admin().indices().prepareResizeIndex("source", "target").setSettings(shrinkSettings).get()); + + ensureGreen(TimeValue.timeValueSeconds(120)); + + final IndexMetadata afterShrinkIndexMetadata = indexMetadata(client(), "target"); + for (int shardId = 0; shardId < numberOfTargetShards; shardId++) { + assertThat(afterShrinkIndexMetadata.primaryTerm(shardId), equalTo(beforeShrinkPrimaryTerm + 1)); + } + } + + private static IndexMetadata indexMetadata(final Client client, final String index) { + final ClusterStateResponse clusterStateResponse = client.admin().cluster().state(new ClusterStateRequest()).actionGet(); + return clusterStateResponse.getState().metadata().index(index); + } + + public void testCreateShrinkIndex() { + internalCluster().ensureAtLeastNumDataNodes(2); + Version version = VersionUtils.randomVersion(random()); + prepareCreate("source").setSettings( + Settings.builder().put(indexSettings()).put("number_of_shards", randomIntBetween(2, 7)).put("index.version.created", version) + 
).get(); + final int docs = randomIntBetween(0, 128); + for (int i = 0; i < docs; i++) { + client().prepareIndex("source").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + final Map dataNodes = client().admin().cluster().prepareState().get().getState().nodes().getDataNodes(); + assertTrue("at least 2 nodes but was: " + dataNodes.size(), dataNodes.size() >= 2); + DiscoveryNode[] discoveryNodes = dataNodes.values().toArray(new DiscoveryNode[0]); + // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node + // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due + // to the require._name below. + ensureGreen(); + // relocate all shards to one node such that we can merge it. + client().admin() + .indices() + .prepareUpdateSettings("source") + .setSettings( + Settings.builder() + .put("index.routing.allocation.require._name", discoveryNodes[0].getName()) + .put("index.blocks.write", true) + ) + .get(); + ensureGreen(); + + final IndicesStatsResponse sourceStats = client().admin().indices().prepareStats("source").setSegments(true).get(); + + // disable rebalancing to be able to capture the right stats. balancing can move the target primary + // making it hard to pin point the source shards. + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none")) + .get(); + + // now merge source into a single shard index + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setSettings( + Settings.builder() + .put("index.number_of_replicas", 0) + .putNull("index.blocks.write") + .putNull("index.routing.allocation.require._name") + .build() + ) + .get() + ); + ensureGreen(); + + // resolve true merge node - this is not always the node we required as all shards may be on another node + final ClusterState state = client().admin().cluster().prepareState().get().getState(); + DiscoveryNode mergeNode = state.nodes().get(state.getRoutingTable().index("target").shard(0).primaryShard().currentNodeId()); + logger.info("merge node {}", mergeNode); + + final long maxSeqNo = Arrays.stream(sourceStats.getShards()) + .filter(shard -> shard.getShardRouting().currentNodeId().equals(mergeNode.getId())) + .map(ShardStats::getSeqNoStats) + .mapToLong(SeqNoStats::getMaxSeqNo) + .max() + .getAsLong(); + final long maxUnsafeAutoIdTimestamp = Arrays.stream(sourceStats.getShards()) + .filter(shard -> shard.getShardRouting().currentNodeId().equals(mergeNode.getId())) + .map(ShardStats::getStats) + .map(CommonStats::getSegments) + .mapToLong(SegmentsStats::getMaxUnsafeAutoIdTimestamp) + .max() + .getAsLong(); + + final IndicesStatsResponse targetStats = client().admin().indices().prepareStats("target").get(); + for (final ShardStats shardStats : targetStats.getShards()) { + final SeqNoStats seqNoStats = shardStats.getSeqNoStats(); + final ShardRouting shardRouting = shardStats.getShardRouting(); + assertThat("failed on " + shardRouting, seqNoStats.getMaxSeqNo(), equalTo(maxSeqNo)); + assertThat("failed on " + shardRouting, seqNoStats.getLocalCheckpoint(), equalTo(maxSeqNo)); + assertThat( + "failed on " + shardRouting, + shardStats.getStats().getSegments().getMaxUnsafeAutoIdTimestamp(), + equalTo(maxUnsafeAutoIdTimestamp) + ); + } + + final int size = docs > 0 ? 
2 * docs : 1; + assertHitCount(client().prepareSearch("target").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs); + + for (int i = docs; i < 2 * docs; i++) { + client().prepareIndex("target").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + flushAndRefresh(); + assertHitCount(client().prepareSearch("target").setSize(2 * size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 2 * docs); + assertHitCount(client().prepareSearch("source").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs); + GetSettingsResponse target = client().admin().indices().prepareGetSettings("target").get(); + assertEquals(version, target.getIndexToSettings().get("target").getAsVersion("index.version.created", null)); + + // clean up + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings( + Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), (String) null) + ) + .get(); + } + + /** + * Tests that we can manually recover from a failed allocation due to shards being moved away etc. + */ + public void testCreateShrinkIndexFails() throws Exception { + internalCluster().ensureAtLeastNumDataNodes(2); + prepareCreate("source").setSettings( + Settings.builder().put(indexSettings()).put("number_of_shards", randomIntBetween(2, 7)).put("number_of_replicas", 0) + ).get(); + for (int i = 0; i < 20; i++) { + client().prepareIndex("source").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + final Map dataNodes = client().admin().cluster().prepareState().get().getState().nodes().getDataNodes(); + assertTrue("at least 2 nodes but was: " + dataNodes.size(), dataNodes.size() >= 2); + DiscoveryNode[] discoveryNodes = dataNodes.values().toArray(new DiscoveryNode[0]); + String spareNode = discoveryNodes[0].getName(); + String mergeNode = discoveryNodes[1].getName(); + // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node + // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due + // to the require._name below. + ensureGreen(); + // relocate all shards to one node such that we can merge it. + client().admin() + .indices() + .prepareUpdateSettings("source") + .setSettings(Settings.builder().put("index.routing.allocation.require._name", mergeNode).put("index.blocks.write", true)) + .get(); + ensureGreen(); + + // now merge source into a single shard index + client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setWaitForActiveShards(ActiveShardCount.NONE) + .setSettings( + Settings.builder() + .put("index.routing.allocation.exclude._name", mergeNode) // we manually exclude the merge node to forcefully fuck it up + .put("index.number_of_replicas", 0) + .put("index.allocation.max_retries", 1) + .build() + ) + .get(); + client().admin().cluster().prepareHealth("target").setWaitForEvents(Priority.LANGUID).get(); + + // now we move all shards away from the merge node + client().admin() + .indices() + .prepareUpdateSettings("source") + .setSettings(Settings.builder().put("index.routing.allocation.require._name", spareNode).put("index.blocks.write", true)) + .get(); + ensureGreen("source"); + + client().admin() + .indices() + .prepareUpdateSettings("target") // erase the forcefully fuckup! 
+ .setSettings(Settings.builder().putNull("index.routing.allocation.exclude._name")) + .get(); + // wait until it fails + assertBusy(() -> { + ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().get(); + RoutingTable routingTables = clusterStateResponse.getState().routingTable(); + assertTrue(routingTables.index("target").shard(0).getShards().get(0).unassigned()); + assertEquals( + UnassignedInfo.Reason.ALLOCATION_FAILED, + routingTables.index("target").shard(0).getShards().get(0).unassignedInfo().getReason() + ); + assertEquals(1, routingTables.index("target").shard(0).getShards().get(0).unassignedInfo().getNumFailedAllocations()); + }); + client().admin() + .indices() + .prepareUpdateSettings("source") // now relocate them all to the right node + .setSettings(Settings.builder().put("index.routing.allocation.require._name", mergeNode)) + .get(); + ensureGreen("source"); + + final InternalClusterInfoService infoService = (InternalClusterInfoService) internalCluster().getInstance( + ClusterInfoService.class, + internalCluster().getClusterManagerName() + ); + infoService.refresh(); + // kick off a retry and wait until it's done! + ClusterRerouteResponse clusterRerouteResponse = client().admin().cluster().prepareReroute().setRetryFailed(true).get(); + long expectedShardSize = clusterRerouteResponse.getState() + .routingTable() + .index("target") + .shard(0) + .getShards() + .get(0) + .getExpectedShardSize(); + // we support the expected shard size in the allocator to sum up over the source index shards + assertTrue("expected shard size must be set but wasn't: " + expectedShardSize, expectedShardSize > 0); + ensureGreen(); + assertHitCount(client().prepareSearch("target").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 20); + } + + public void testCreateShrinkWithIndexSort() throws Exception { + SortField expectedSortField = new SortedSetSortField("id", true, SortedSetSelector.Type.MAX); + expectedSortField.setMissingValue(SortedSetSortField.STRING_FIRST); + Sort expectedIndexSort = new Sort(expectedSortField); + internalCluster().ensureAtLeastNumDataNodes(2); + prepareCreate("source").setSettings( + Settings.builder() + .put(indexSettings()) + .put("sort.field", "id") + .put("sort.order", "desc") + .put("number_of_shards", 8) + .put("number_of_replicas", 0) + ).setMapping("id", "type=keyword,doc_values=true").get(); + for (int i = 0; i < 20; i++) { + client().prepareIndex("source") + .setId(Integer.toString(i)) + .setSource("{\"foo\" : \"bar\", \"id\" : " + i + "}", MediaTypeRegistry.JSON) + .get(); + } + final Map dataNodes = client().admin().cluster().prepareState().get().getState().nodes().getDataNodes(); + assertTrue("at least 2 nodes but was: " + dataNodes.size(), dataNodes.size() >= 2); + DiscoveryNode[] discoveryNodes = dataNodes.values().toArray(new DiscoveryNode[0]); + String mergeNode = discoveryNodes[0].getName(); + // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node + // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due + // to the require._name below. + ensureGreen(); + + flushAndRefresh(); + assertSortedSegments("source", expectedIndexSort); + + // relocate all shards to one node such that we can merge it. 
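+        // the index.blocks.write block makes the source read-only, which the resize operation below requires.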
+ client().admin() + .indices() + .prepareUpdateSettings("source") + .setSettings(Settings.builder().put("index.routing.allocation.require._name", mergeNode).put("index.blocks.write", true)) + .get(); + ensureGreen(); + + // check that index sort cannot be set on the target index + IllegalArgumentException exc = expectThrows( + IllegalArgumentException.class, + () -> client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setSettings( + Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", "2") + .put("index.sort.field", "foo") + .build() + ) + .get() + ); + assertThat(exc.getMessage(), containsString("can't override index sort when resizing an index")); + + // check that the index sort order of `source` is correctly applied to the `target` + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setSettings( + Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", "2") + .putNull("index.blocks.write") + .build() + ) + .get() + ); + ensureGreen(); + flushAndRefresh(); + GetSettingsResponse settingsResponse = client().admin().indices().prepareGetSettings("target").execute().actionGet(); + assertEquals(settingsResponse.getSetting("target", "index.sort.field"), "id"); + assertEquals(settingsResponse.getSetting("target", "index.sort.order"), "desc"); + assertSortedSegments("target", expectedIndexSort); + + // ... and that the index sort is also applied to updates + for (int i = 20; i < 40; i++) { + client().prepareIndex("target").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + flushAndRefresh(); + assertSortedSegments("target", expectedIndexSort); + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteSplitIndexIT.java b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteSplitIndexIT.java new file mode 100644 index 0000000000000..dd4252d24f314 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteSplitIndexIT.java @@ -0,0 +1,506 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. 
+ */ + +package org.opensearch.action.admin.indices.create; + +import org.apache.lucene.search.join.ScoreMode; +import org.apache.lucene.util.Constants; +import org.opensearch.Version; +import org.opensearch.action.admin.cluster.state.ClusterStateRequest; +import org.opensearch.action.admin.cluster.state.ClusterStateResponse; +import org.opensearch.action.admin.indices.settings.get.GetSettingsResponse; +import org.opensearch.action.admin.indices.shrink.ResizeType; +import org.opensearch.action.admin.indices.stats.CommonStats; +import org.opensearch.action.admin.indices.stats.IndicesStatsResponse; +import org.opensearch.action.admin.indices.stats.ShardStats; +import org.opensearch.action.get.GetResponse; +import org.opensearch.action.index.IndexRequest; +import org.opensearch.action.index.IndexRequestBuilder; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.Client; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.metadata.MetadataCreateIndexService; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.routing.Murmur3HashFunction; +import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.index.Index; +import org.opensearch.core.xcontent.MediaTypeRegistry; +import org.opensearch.index.IndexModule; +import org.opensearch.index.IndexService; +import org.opensearch.index.engine.SegmentsStats; +import org.opensearch.index.query.TermsQueryBuilder; +import org.opensearch.index.seqno.SeqNoStats; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.indices.IndicesService; +import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.remotestore.RemoteStoreBaseIntegTestCase; +import org.opensearch.test.VersionUtils; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.function.BiFunction; +import java.util.stream.IntStream; + +import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; +import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.opensearch.index.query.QueryBuilders.nestedQuery; +import static org.opensearch.index.query.QueryBuilders.termQuery; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertHitCount; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertNoFailures; +import static org.hamcrest.Matchers.equalTo; + +public class RemoteSplitIndexIT extends RemoteStoreBaseIntegTestCase { + + @Override + protected boolean forbidPrivateIndexSettings() { + return false; + } + + public Settings indexSettings() { + return Settings.builder() + .put(super.indexSettings()) + .put(IndexModule.INDEX_QUERY_CACHE_ENABLED_SETTING.getKey(), false) + .put(SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT) + .build(); + } + + public void testCreateSplitIndexToN() throws IOException { + int[][] possibleShardSplits = new int[][] { { 2, 4, 8 }, { 3, 6, 12 }, { 1, 2, 4 } }; + int[] shardSplits = randomFrom(possibleShardSplits); + splitToN(shardSplits[0], shardSplits[1], shardSplits[2]); + } 
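+
+    // Splits a one-shard source twice (1 -> 5 -> 10), deletes all indices, then repeats with
+    // random factors (1 -> n -> 2n). Skipped on Windows via the assumeFalse below.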
+ + public void testSplitFromOneToN() { + + assumeFalse("https://github.com/elastic/elasticsearch/issues/34080", Constants.WINDOWS); + + splitToN(1, 5, 10); + client().admin().indices().prepareDelete("*").get(); + int randomSplit = randomIntBetween(2, 6); + splitToN(1, randomSplit, randomSplit * 2); + } + + private void splitToN(int sourceShards, int firstSplitShards, int secondSplitShards) { + + assertEquals(sourceShards, (sourceShards * firstSplitShards) / firstSplitShards); + assertEquals(firstSplitShards, (firstSplitShards * secondSplitShards) / secondSplitShards); + internalCluster().ensureAtLeastNumDataNodes(2); + final boolean useRouting = randomBoolean(); + final boolean useNested = randomBoolean(); + final boolean useMixedRouting = useRouting ? randomBoolean() : false; + CreateIndexRequestBuilder createInitialIndex = prepareCreate("source"); + Settings.Builder settings = Settings.builder().put(indexSettings()).put("number_of_shards", sourceShards); + final boolean useRoutingPartition; + if (randomBoolean()) { + // randomly set the value manually + int routingShards = secondSplitShards * randomIntBetween(1, 10); + settings.put("index.number_of_routing_shards", routingShards); + useRoutingPartition = false; + } else { + useRoutingPartition = randomBoolean(); + } + if (useRouting && useMixedRouting == false && useRoutingPartition) { + int numRoutingShards = MetadataCreateIndexService.calculateNumRoutingShards(secondSplitShards, Version.CURRENT) - 1; + settings.put("index.routing_partition_size", randomIntBetween(1, numRoutingShards)); + if (useNested) { + createInitialIndex.setMapping("_routing", "required=true", "nested1", "type=nested"); + } else { + createInitialIndex.setMapping("_routing", "required=true"); + } + } else if (useNested) { + createInitialIndex.setMapping("nested1", "type=nested"); + } + logger.info("use routing {} use mixed routing {} use nested {}", useRouting, useMixedRouting, useNested); + createInitialIndex.setSettings(settings).get(); + + int numDocs = randomIntBetween(10, 50); + String[] routingValue = new String[numDocs]; + + BiFunction indexFunc = (index, id) -> { + try { + return client().prepareIndex(index) + .setId(Integer.toString(id)) + .setSource( + jsonBuilder().startObject() + .field("foo", "bar") + .field("i", id) + .startArray("nested1") + .startObject() + .field("n_field1", "n_value1_1") + .field("n_field2", "n_value2_1") + .endObject() + .startObject() + .field("n_field1", "n_value1_2") + .field("n_field2", "n_value2_2") + .endObject() + .endArray() + .endObject() + ); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }; + for (int i = 0; i < numDocs; i++) { + IndexRequestBuilder builder = indexFunc.apply("source", i); + if (useRouting) { + String routing = randomRealisticUnicodeOfCodepointLengthBetween(1, 10); + if (useMixedRouting && randomBoolean()) { + routingValue[i] = null; + } else { + routingValue[i] = routing; + } + builder.setRouting(routingValue[i]); + } + builder.get(); + } + + if (randomBoolean()) { + for (int i = 0; i < numDocs; i++) { // let's introduce some updates / deletes on the index + if (randomBoolean()) { + IndexRequestBuilder builder = indexFunc.apply("source", i); + if (useRouting) { + builder.setRouting(routingValue[i]); + } + builder.get(); + } + } + } + + ensureYellow(); + client().admin().indices().prepareUpdateSettings("source").setSettings(Settings.builder().put("index.blocks.write", true)).get(); + ensureGreen(); + Settings.Builder firstSplitSettingsBuilder = Settings.builder() + 
.put("index.number_of_replicas", 0) + .put("index.number_of_shards", firstSplitShards) + .putNull("index.blocks.write"); + if (sourceShards == 1 && useRoutingPartition == false && randomBoolean()) { // try to set it if we have a source index with 1 shard + firstSplitSettingsBuilder.put("index.number_of_routing_shards", secondSplitShards); + } + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("source", "first_split") + .setResizeType(ResizeType.SPLIT) + .setSettings(firstSplitSettingsBuilder.build()) + .get() + ); + ensureGreen(); + assertHitCount(client().prepareSearch("first_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + + for (int i = 0; i < numDocs; i++) { // now update + IndexRequestBuilder builder = indexFunc.apply("first_split", i); + if (useRouting) { + builder.setRouting(routingValue[i]); + } + builder.get(); + } + flushAndRefresh(); + assertHitCount(client().prepareSearch("first_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + assertHitCount(client().prepareSearch("source").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + for (int i = 0; i < numDocs; i++) { + GetResponse getResponse = client().prepareGet("first_split", Integer.toString(i)).setRouting(routingValue[i]).get(); + assertTrue(getResponse.isExists()); + } + + client().admin() + .indices() + .prepareUpdateSettings("first_split") + .setSettings(Settings.builder().put("index.blocks.write", true)) + .get(); + ensureGreen(); + // now split source into a new index + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("first_split", "second_split") + .setResizeType(ResizeType.SPLIT) + .setSettings( + Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", secondSplitShards) + .putNull("index.blocks.write") + .build() + ) + .get() + ); + ensureGreen(); + assertHitCount(client().prepareSearch("second_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + // let it be allocated anywhere and bump replicas + client().admin() + .indices() + .prepareUpdateSettings("second_split") + .setSettings(Settings.builder().put("index.number_of_replicas", 0)) + .get(); + ensureGreen(); + assertHitCount(client().prepareSearch("second_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + + for (int i = 0; i < numDocs; i++) { // now update + IndexRequestBuilder builder = indexFunc.apply("second_split", i); + if (useRouting) { + builder.setRouting(routingValue[i]); + } + builder.get(); + } + flushAndRefresh(); + for (int i = 0; i < numDocs; i++) { + GetResponse getResponse = client().prepareGet("second_split", Integer.toString(i)).setRouting(routingValue[i]).get(); + assertTrue(getResponse.isExists()); + } + assertHitCount(client().prepareSearch("second_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + assertHitCount(client().prepareSearch("first_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + assertHitCount(client().prepareSearch("source").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + if (useNested) { + assertNested("source", numDocs); + assertNested("first_split", numDocs); + assertNested("second_split", numDocs); + } + assertAllUniqueDocs( + client().prepareSearch("second_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), + numDocs + ); + assertAllUniqueDocs( + 
client().prepareSearch("first_split").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), + numDocs + ); + assertAllUniqueDocs(client().prepareSearch("source").setSize(100).setQuery(new TermsQueryBuilder("foo", "bar")).get(), numDocs); + } + + public void assertNested(String index, int numDocs) { + // now, do a nested query + SearchResponse searchResponse = client().prepareSearch(index) + .setQuery(nestedQuery("nested1", termQuery("nested1.n_field1", "n_value1_1"), ScoreMode.Avg)) + .get(); + assertNoFailures(searchResponse); + assertThat(searchResponse.getHits().getTotalHits().value, equalTo((long) numDocs)); + } + + public void assertAllUniqueDocs(SearchResponse response, int numDocs) { + Set ids = new HashSet<>(); + for (int i = 0; i < response.getHits().getHits().length; i++) { + String id = response.getHits().getHits()[i].getId(); + assertTrue("found ID " + id + " more than once", ids.add(id)); + } + assertEquals(numDocs, ids.size()); + } + + public void testSplitIndexPrimaryTerm() throws Exception { + int numberOfTargetShards = randomIntBetween(2, 20); + int numberOfShards = randomValueOtherThanMany(n -> numberOfTargetShards % n != 0, () -> between(1, numberOfTargetShards - 1)); + internalCluster().ensureAtLeastNumDataNodes(2); + prepareCreate("source").setSettings( + Settings.builder() + .put(indexSettings()) + .put("number_of_shards", numberOfShards) + .put("index.number_of_routing_shards", numberOfTargetShards) + ).get(); + ensureGreen(TimeValue.timeValueSeconds(120)); // needs more than the default to allocate many shards + + // fail random primary shards to force primary terms to increase + final Index source = resolveIndex("source"); + final int iterations = scaledRandomIntBetween(0, 16); + for (int i = 0; i < iterations; i++) { + final String node = randomSubsetOf(1, internalCluster().nodesInclude("source")).get(0); + final IndicesService indexServices = internalCluster().getInstance(IndicesService.class, node); + final IndexService indexShards = indexServices.indexServiceSafe(source); + for (final Integer shardId : indexShards.shardIds()) { + final IndexShard shard = indexShards.getShard(shardId); + if (shard.routingEntry().primary() && randomBoolean()) { + disableAllocation("source"); + shard.failShard("test", new Exception("test")); + // this can not succeed until the shard is failed and a replica is promoted + int id = 0; + while (true) { + // find an ID that routes to the right shard, we will only index to the shard that saw a primary failure + final String s = Integer.toString(id); + final int hash = Math.floorMod(Murmur3HashFunction.hash(s), numberOfShards); + if (hash == shardId) { + final IndexRequest request = new IndexRequest("source").id(s) + .source("{ \"f\": \"" + s + "\"}", MediaTypeRegistry.JSON); + client().index(request).get(); + break; + } else { + id++; + } + } + enableAllocation("source"); + ensureGreen(); + } + } + } + + final Settings.Builder prepareSplitSettings = Settings.builder().put("index.blocks.write", true); + client().admin().indices().prepareUpdateSettings("source").setSettings(prepareSplitSettings).get(); + ensureYellow(); + + final IndexMetadata indexMetadata = indexMetadata(client(), "source"); + final long beforeSplitPrimaryTerm = IntStream.range(0, numberOfShards).mapToLong(indexMetadata::primaryTerm).max().getAsLong(); + + // now split source into target + final Settings splitSettings = Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", numberOfTargetShards) + 
.putNull("index.blocks.write") + .build(); + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setResizeType(ResizeType.SPLIT) + .setSettings(splitSettings) + .get() + ); + + ensureGreen(TimeValue.timeValueSeconds(120)); // needs more than the default to relocate many shards + + final IndexMetadata aftersplitIndexMetadata = indexMetadata(client(), "target"); + for (int shardId = 0; shardId < numberOfTargetShards; shardId++) { + assertThat(aftersplitIndexMetadata.primaryTerm(shardId), equalTo(beforeSplitPrimaryTerm + 1)); + } + } + + private static IndexMetadata indexMetadata(final Client client, final String index) { + final ClusterStateResponse clusterStateResponse = client.admin().cluster().state(new ClusterStateRequest()).actionGet(); + return clusterStateResponse.getState().metadata().index(index); + } + + public void testCreateSplitIndex() throws Exception { + internalCluster().ensureAtLeastNumDataNodes(2); + Version version = VersionUtils.randomIndexCompatibleVersion(random()); + prepareCreate("source").setSettings( + Settings.builder().put(indexSettings()).put("number_of_shards", 1).put("index.version.created", version) + ).get(); + final int docs = randomIntBetween(0, 128); + for (int i = 0; i < docs; i++) { + client().prepareIndex("source").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node + // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due + // to the require._name below. + ensureGreen(); + // relocate all shards to one node such that we can merge it. + client().admin().indices().prepareUpdateSettings("source").setSettings(Settings.builder().put("index.blocks.write", true)).get(); + ensureGreen(); + + final IndicesStatsResponse sourceStats = client().admin().indices().prepareStats("source").setSegments(true).get(); + + // disable rebalancing to be able to capture the right stats. balancing can move the target primary + // making it hard to pin point the source shards. 
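+        // rebalancing is restored in the finally block below once the target shard stats have been verified.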
+ client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none")) + .get(); + try { + assertAcked( + client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setResizeType(ResizeType.SPLIT) + .setSettings( + Settings.builder() + .put("index.number_of_replicas", 0) + .put("index.number_of_shards", 2) + .putNull("index.blocks.write") + .build() + ) + .get() + ); + ensureGreen(); + + final ClusterState state = client().admin().cluster().prepareState().get().getState(); + DiscoveryNode mergeNode = state.nodes().get(state.getRoutingTable().index("target").shard(0).primaryShard().currentNodeId()); + logger.info("split node {}", mergeNode); + + final long maxSeqNo = Arrays.stream(sourceStats.getShards()) + .filter(shard -> shard.getShardRouting().currentNodeId().equals(mergeNode.getId())) + .map(ShardStats::getSeqNoStats) + .mapToLong(SeqNoStats::getMaxSeqNo) + .max() + .getAsLong(); + final long maxUnsafeAutoIdTimestamp = Arrays.stream(sourceStats.getShards()) + .filter(shard -> shard.getShardRouting().currentNodeId().equals(mergeNode.getId())) + .map(ShardStats::getStats) + .map(CommonStats::getSegments) + .mapToLong(SegmentsStats::getMaxUnsafeAutoIdTimestamp) + .max() + .getAsLong(); + + final IndicesStatsResponse targetStats = client().admin().indices().prepareStats("target").get(); + for (final ShardStats shardStats : targetStats.getShards()) { + final SeqNoStats seqNoStats = shardStats.getSeqNoStats(); + final ShardRouting shardRouting = shardStats.getShardRouting(); + assertThat("failed on " + shardRouting, seqNoStats.getMaxSeqNo(), equalTo(maxSeqNo)); + assertThat("failed on " + shardRouting, seqNoStats.getLocalCheckpoint(), equalTo(maxSeqNo)); + assertThat( + "failed on " + shardRouting, + shardStats.getStats().getSegments().getMaxUnsafeAutoIdTimestamp(), + equalTo(maxUnsafeAutoIdTimestamp) + ); + } + + final int size = docs > 0 ? 
2 * docs : 1; + assertHitCount(client().prepareSearch("target").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs); + + for (int i = docs; i < 2 * docs; i++) { + client().prepareIndex("target").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + flushAndRefresh(); + assertHitCount( + client().prepareSearch("target").setSize(2 * size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), + 2 * docs + ); + assertHitCount(client().prepareSearch("source").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs); + GetSettingsResponse target = client().admin().indices().prepareGetSettings("target").get(); + assertEquals(version, target.getIndexToSettings().get("target").getAsVersion("index.version.created", null)); + } finally { + // clean up + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings( + Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), (String) null) + ) + .get(); + } + + } + +} diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java index 9e0b2a66467de..ad78c503a4a19 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java @@ -330,6 +330,8 @@ public void testRestoreInSameRemoteStoreEnabledIndex() throws IOException { assertEquals(restoreSnapshotResponse1.status(), RestStatus.ACCEPTED); assertEquals(restoreSnapshotResponse2.status(), RestStatus.ACCEPTED); ensureGreen(indexName1, restoredIndexName2); + + assertRemoteSegmentsAndTranslogUploaded(restoredIndexName2); assertDocsPresentInIndex(client, indexName1, numDocsInIndex1); assertDocsPresentInIndex(client, restoredIndexName2, numDocsInIndex2); // indexing some new docs and validating @@ -355,6 +357,29 @@ public void testRestoreInSameRemoteStoreEnabledIndex() throws IOException { assertDocsPresentInIndex(client, indexName1, numDocsInIndex1 + 4); } + void assertRemoteSegmentsAndTranslogUploaded(String idx) throws IOException { + String indexUUID = client().admin().indices().prepareGetSettings(idx).get().getSetting(idx, IndexMetadata.SETTING_INDEX_UUID); + + Path remoteTranslogMetadataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/translog/metadata"); + Path remoteTranslogDataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/translog/data"); + Path segmentMetadataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/segments/metadata"); + Path segmentDataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/segments/data"); + + try ( + Stream translogMetadata = Files.list(remoteTranslogMetadataPath); + Stream translogData = Files.list(remoteTranslogDataPath); + Stream segmentMetadata = Files.list(segmentMetadataPath); + Stream segmentData = Files.list(segmentDataPath); + + ) { + assertTrue(translogData.count() > 0); + assertTrue(translogMetadata.count() > 0); + assertTrue(segmentMetadata.count() > 0); + assertTrue(segmentData.count() > 0); + } + + } + public void testRemoteRestoreIndexRestoredFromSnapshot() throws IOException, ExecutionException, InterruptedException { internalCluster().startClusterManagerOnlyNode(); internalCluster().startDataOnlyNodes(2); @@ -395,23 +420,7 @@ public void testRemoteRestoreIndexRestoredFromSnapshot() throws IOException, Exe 
ensureGreen(indexName1); assertDocsPresentInIndex(client(), indexName1, numDocsInIndex1); - // Make sure remote translog is empty - String indexUUID = client().admin() - .indices() - .prepareGetSettings(indexName1) - .get() - .getSetting(indexName1, IndexMetadata.SETTING_INDEX_UUID); - - Path remoteTranslogMetadataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/translog/metadata"); - Path remoteTranslogDataPath = Path.of(String.valueOf(remoteRepoPath), indexUUID, "/0/translog/data"); - - try ( - Stream translogMetadata = Files.list(remoteTranslogMetadataPath); - Stream translogData = Files.list(remoteTranslogDataPath) - ) { - assertTrue(translogData.count() > 0); - assertTrue(translogMetadata.count() > 0); - } + assertRemoteSegmentsAndTranslogUploaded(indexName1); // Clear the local data before stopping the node. This will make sure that remote translog is empty. IndexShard indexShard = getIndexShard(primaryNodeName(indexName1), indexName1); diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java index bccca283ba772..8b4981a15433a 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java @@ -56,7 +56,7 @@ public class RemoteStoreBaseIntegTestCase extends OpenSearchIntegTestCase { protected static final String REPOSITORY_NAME = "test-remote-store-repo"; protected static final String REPOSITORY_2_NAME = "test-remote-store-repo-2"; protected static final int SHARD_COUNT = 1; - protected static final int REPLICA_COUNT = 1; + protected static int REPLICA_COUNT = 1; protected static final String TOTAL_OPERATIONS = "total-operations"; protected static final String REFRESHED_OR_FLUSHED_OPERATIONS = "refreshed-or-flushed-operations"; protected static final String MAX_SEQ_NO_TOTAL = "max-seq-no-total"; diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 352d4efc95269..32396f1a3df2e 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -203,6 +203,7 @@ import java.nio.file.NoSuchFileException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.EnumSet; import java.util.HashSet; @@ -2006,6 +2007,29 @@ private RemoteSegmentStoreDirectory getRemoteDirectory() { return ((RemoteSegmentStoreDirectory) remoteDirectory); } + /** + Returns true iff it is able to verify that remote segment store + is in sync with local + */ + boolean isRemoteSegmentStoreInSync() { + assert indexSettings.isRemoteStoreEnabled(); + try { + RemoteSegmentStoreDirectory directory = getRemoteDirectory(); + if (directory.readLatestMetadataFile() != null) { + // verifying that all files except EXCLUDE_FILES are uploaded to the remote + Collection uploadFiles = directory.getSegmentsUploadedToRemoteStore().keySet(); + SegmentInfos segmentInfos = store.readLastCommittedSegmentsInfo(); + Collection localFiles = segmentInfos.files(true); + if (uploadFiles.containsAll(localFiles)) { + return true; + } + } + } catch (IOException e) { + logger.error("Exception while reading latest metadata", e); + } + return false; + } + public void preRecovery() { final 
IndexShardState currentState = this.state; // single volatile read if (currentState == IndexShardState.CLOSED) { diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index 464adc88ae16f..dd40327298874 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -20,6 +20,7 @@ import org.opensearch.action.LatchedActionListener; import org.opensearch.action.bulk.BackoffPolicy; import org.opensearch.action.support.GroupedActionListener; +import org.opensearch.cluster.routing.RecoverySource; import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.common.logging.Loggers; import org.opensearch.common.unit.TimeValue; @@ -179,6 +180,9 @@ private boolean shouldSync(boolean didRefresh, boolean skipPrimaryTermCheck) { return this.primaryTerm != indexShard.getOperationPrimaryTerm(); } + /* + @return false if retry is needed + */ private boolean syncSegments() { if (isReadyForUpload() == false) { // Following check is required to enable retry and make sure that we do not lose this refresh event @@ -485,7 +489,9 @@ private void initializeRemoteDirectoryOnTermUpdate() throws IOException { * @return true iff primaryMode is true and index shard is not in closed state. */ private boolean isReadyForUpload() { - boolean isReady = indexShard.getReplicationTracker().isPrimaryMode() && indexShard.state() != IndexShardState.CLOSED; + boolean isReady = (indexShard.getReplicationTracker().isPrimaryMode() && indexShard.state() != IndexShardState.CLOSED) + || isLocalOrSnapshotRecovery(); + if (isReady == false) { StringBuilder sb = new StringBuilder("Skipped syncing segments with"); if (indexShard.getReplicationTracker() != null) { @@ -497,11 +503,25 @@ private boolean isReadyForUpload() { if (indexShard.getEngineOrNull() != null) { sb.append(" engineType=").append(indexShard.getEngine().getClass().getSimpleName()); } + if (isLocalOrSnapshotRecovery() == false) { + sb.append(" recoverySourceType=").append(indexShard.recoveryState().getRecoverySource().getType()); + sb.append(" primary=").append(indexShard.shardRouting.primary()); + } logger.trace(sb.toString()); } return isReady; } + private boolean isLocalOrSnapshotRecovery() { + // In this case when the primary mode is false, we need to upload segments to Remote Store + // This is required in case of snapshots/shrink/ split/clone where we need to durable persist + // all segments to remote before completing the recovery to ensure durability. 
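+        // i.e. a recovering primary whose recovery source is LOCAL_SHARDS (shrink/split/clone) or SNAPSHOT.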
+ + return (indexShard.state() == IndexShardState.RECOVERING && indexShard.shardRouting.primary()) + && (indexShard.recoveryState().getRecoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS + || indexShard.recoveryState().getRecoverySource().getType() == RecoverySource.Type.SNAPSHOT); + } + /** * Creates an {@link UploadListener} containing the stats population logic which would be triggered before and after segment upload events */ diff --git a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java index c0211e1257c8e..e823401e5ef7e 100644 --- a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java +++ b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java @@ -191,6 +191,15 @@ void recoverFromLocalShards( // just trigger a merge to do housekeeping on the // copied segments - we will also see them in stats etc. indexShard.getEngine().forceMerge(false, -1, false, false, false, UUIDs.randomBase64UUID()); + if (indexShard.isRemoteTranslogEnabled()) { + if (indexShard.isRemoteSegmentStoreInSync() == false) { + throw new IndexShardRecoveryException( + indexShard.shardId(), + "failed to upload to remote", + new IOException("Failed to upload to remote segment store") + ); + } + } return true; } catch (IOException ex) { throw new IndexShardRecoveryException(indexShard.shardId(), "failed to recover from local shards", ex); @@ -418,6 +427,12 @@ void recoverFromSnapshotAndRemoteStore( } indexShard.getEngine().fillSeqNoGaps(indexShard.getPendingPrimaryTerm()); indexShard.finalizeRecovery(); + if (indexShard.isRemoteTranslogEnabled()) { + if (indexShard.isRemoteSegmentStoreInSync() == false) { + listener.onFailure(new IndexShardRestoreFailedException(shardId, "Failed to upload to remote segment store")); + return; + } + } indexShard.postRecovery("restore done"); listener.onResponse(true); @@ -697,6 +712,12 @@ private void restore( } indexShard.getEngine().fillSeqNoGaps(indexShard.getPendingPrimaryTerm()); indexShard.finalizeRecovery(); + if (indexShard.isRemoteTranslogEnabled()) { + if (indexShard.isRemoteSegmentStoreInSync() == false) { + listener.onFailure(new IndexShardRestoreFailedException(shardId, "Failed to upload to remote segment store")); + return; + } + } indexShard.postRecovery("restore done"); listener.onResponse(true); }, e -> listener.onFailure(new IndexShardRestoreFailedException(shardId, "restore failed", e))); diff --git a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java index fa3cf7676f55c..dc2111fdcfc56 100644 --- a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java @@ -2850,6 +2850,7 @@ public void testSyncSegmentsFromGivenRemoteSegmentStore() throws IOException { indexDoc(source, "_doc", "1"); indexDoc(source, "_doc", "2"); source.refresh("test"); + assertTrue("At lease one remote sync should have been completed", source.isRemoteSegmentStoreInSync()); assertDocs(source, "1", "2"); indexDoc(source, "_doc", "3"); source.refresh("test"); From f372cbf89377036c67143360f903322105ceafe2 Mon Sep 17 00:00:00 2001 From: Aman Khare <85096200+amkhar@users.noreply.github.com> Date: Sat, 28 Oct 2023 23:24:03 +0530 Subject: [PATCH 27/33] Add statsName field on stream while constructing PersistedStateStats (#10964) Signed-off-by: Aman Khare Co-authored-by: Aman Khare --- 
.../remote/RemoteClusterStateServiceIT.java | 70 ++++++++++++++++--- .../coordination/PersistedStateStats.java | 8 ++- .../cluster/node/stats/NodeStatsTests.java | 1 + 3 files changed, 69 insertions(+), 10 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java b/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java index 59eef3c06844b..dcf695d5366ba 100644 --- a/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java @@ -114,6 +114,31 @@ public void testRemoteStateStats() { .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) .get(); + // assert cluster state stats + assertClusterManagerClusterStateStats(nodesStatsResponse); + + NodesStatsResponse nodesStatsResponseDataNode = client().admin() + .cluster() + .prepareNodesStats(dataNode) + .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) + .get(); + // assert cluster state stats for data node + DiscoveryStats dataNodeDiscoveryStats = nodesStatsResponseDataNode.getNodes().get(0).getDiscoveryStats(); + assertNotNull(dataNodeDiscoveryStats.getClusterStateStats()); + assertEquals(0, dataNodeDiscoveryStats.getClusterStateStats().getUpdateSuccess()); + + // call nodes/stats with nodeId filter + NodesStatsResponse nodesStatsNodeIdFilterResponse = client().admin() + .cluster() + .prepareNodesStats(dataNode) + .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) + .setNodesIds(clusterManagerNode) + .get(); + + assertClusterManagerClusterStateStats(nodesStatsNodeIdFilterResponse); + } + + private void assertClusterManagerClusterStateStats(NodesStatsResponse nodesStatsResponse) { // assert cluster state stats DiscoveryStats discoveryStats = nodesStatsResponse.getNodes().get(0).getDiscoveryStats(); @@ -125,16 +150,43 @@ public void testRemoteStateStats() { assertTrue(discoveryStats.getClusterStateStats().getPersistenceStats().get(0).getSuccessCount() > 1); assertEquals(0, discoveryStats.getClusterStateStats().getPersistenceStats().get(0).getFailedCount()); assertTrue(discoveryStats.getClusterStateStats().getPersistenceStats().get(0).getTotalTimeInMillis() > 0); + } - NodesStatsResponse nodesStatsResponseDataNode = client().admin() - .cluster() - .prepareNodesStats(dataNode) - .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) - .get(); - // assert cluster state stats for data node - DiscoveryStats dataNodeDiscoveryStats = nodesStatsResponseDataNode.getNodes().get(0).getDiscoveryStats(); - assertNotNull(dataNodeDiscoveryStats.getClusterStateStats()); - assertEquals(0, dataNodeDiscoveryStats.getClusterStateStats().getUpdateSuccess()); + public void testRemoteStateStatsFromAllNodes() { + int shardCount = randomIntBetween(1, 5); + int replicaCount = 1; + int dataNodeCount = shardCount * (replicaCount + 1); + int clusterManagerNodeCount = 3; + prepareCluster(clusterManagerNodeCount, dataNodeCount, INDEX_NAME, replicaCount, shardCount); + String[] allNodes = internalCluster().getNodeNames(); + // call _nodes/stats/discovery from all the nodes + for (String node : allNodes) { + NodesStatsResponse nodesStatsResponse = client().admin() + .cluster() + .prepareNodesStats(node) + .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) + .get(); + validateNodesStatsResponse(nodesStatsResponse); + } + + // call _nodes/stats/discovery from all the nodes with random nodeId 
filter + for (String node : allNodes) { + NodesStatsResponse nodesStatsResponse = client().admin() + .cluster() + .prepareNodesStats(node) + .addMetric(NodesStatsRequest.Metric.DISCOVERY.metricName()) + .setNodesIds(allNodes[randomIntBetween(0, allNodes.length - 1)]) + .get(); + validateNodesStatsResponse(nodesStatsResponse); + } + } + + private void validateNodesStatsResponse(NodesStatsResponse nodesStatsResponse) { + // _nodes/stats/discovery must never fail due to any exception + assertFalse(nodesStatsResponse.toString().contains("exception")); + assertNotNull(nodesStatsResponse.getNodes()); + assertNotNull(nodesStatsResponse.getNodes().get(0)); + assertNotNull(nodesStatsResponse.getNodes().get(0).getDiscoveryStats()); } private void setReplicaCount(int replicaCount) { diff --git a/server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java b/server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java index 1dc20e564ade2..4d466c4b3ad73 100644 --- a/server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java +++ b/server/src/main/java/org/opensearch/cluster/coordination/PersistedStateStats.java @@ -25,7 +25,7 @@ * @opensearch.internal */ public class PersistedStateStats implements Writeable, ToXContentObject { - private String statsName; + private final String statsName; private AtomicLong totalTimeInMillis = new AtomicLong(0); private AtomicLong failedCount = new AtomicLong(0); private AtomicLong successCount = new AtomicLong(0); @@ -37,6 +37,7 @@ public PersistedStateStats(String statsName) { @Override public void writeTo(StreamOutput out) throws IOException { + out.writeString(statsName); out.writeVLong(successCount.get()); out.writeVLong(failedCount.get()); out.writeVLong(totalTimeInMillis.get()); @@ -53,6 +54,7 @@ public void writeTo(StreamOutput out) throws IOException { } public PersistedStateStats(StreamInput in) throws IOException { + this.statsName = in.readString(); this.successCount = new AtomicLong(in.readVLong()); this.failedCount = new AtomicLong(in.readVLong()); this.totalTimeInMillis = new AtomicLong(in.readVLong()); @@ -113,6 +115,10 @@ protected void addToExtendedFields(String extendedField, AtomicLong extendedFiel this.extendedFields.put(extendedField, extendedFieldValue); } + public String getStatsName() { + return statsName; + } + /** * Fields for parsing and toXContent * diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java index 3050d1674a95b..80f4ebf5d737a 100644 --- a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java +++ b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java @@ -368,6 +368,7 @@ public void testSerialization() throws IOException { .getPersistenceStats() .get(0); PersistedStateStats remoteStateStats = stateStats.getPersistenceStats().get(0); + assertEquals(remoteStateStats.getStatsName(), deserializedRemoteStateStats.getStatsName()); assertEquals(remoteStateStats.getFailedCount(), deserializedRemoteStateStats.getFailedCount()); assertEquals(remoteStateStats.getSuccessCount(), deserializedRemoteStateStats.getSuccessCount()); assertEquals(remoteStateStats.getTotalTimeInMillis(), deserializedRemoteStateStats.getTotalTimeInMillis()); From 73bbeb57a8a56de0d1f9cf69e711a8a0ff26afe5 Mon Sep 17 00:00:00 2001 From: Varun Bansal Date: Sun, 29 Oct 2023 10:20:44 +0530 Subject: [PATCH 
28/33] Restore ClusterState version during remote state restore (#10853) * Restore ClusterState version during remote state restore Signed-off-by: bansvaru --- CHANGELOG.md | 1 + .../remote/RemoteClusterStateServiceIT.java | 4 +- .../RemoteStoreClusterStateRestoreIT.java | 53 +++++++++++++++++-- .../remote/RemoteClusterStateService.java | 11 ++-- .../recovery/RemoteStoreRestoreService.java | 17 +++--- .../RemoteClusterStateServiceTests.java | 34 ++++++++---- 6 files changed, 93 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34fd573b295b3..020fb5bda8b8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Remote cluster state] Restore global metadata from remote store when local state is lost after quorum loss ([#10404](https://github.com/opensearch-project/OpenSearch/pull/10404)) - [AdmissionControl] Added changes for AdmissionControl Interceptor and AdmissionControlService for RateLimiting ([#9286](https://github.com/opensearch-project/OpenSearch/pull/9286)) - GHA to verify checklist items completion in PR descriptions ([#10800](https://github.com/opensearch-project/OpenSearch/pull/10800)) +- [Remote cluster state] Restore cluster state version during remote state auto restore ([#10853](https://github.com/opensearch-project/OpenSearch/pull/10853)) ### Dependencies - Bump `log4j-core` from 2.18.0 to 2.19.0 diff --git a/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java b/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java index dcf695d5366ba..dfde1b958882c 100644 --- a/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteClusterStateServiceIT.java @@ -90,10 +90,10 @@ public void testFullClusterRestoreStaleDelete() throws Exception { assertEquals(10, repository.blobStore().blobContainer(baseMetadataPath.add("manifest")).listBlobsByPrefix("manifest").size()); - Map indexMetadataMap = remoteClusterStateService.getLatestMetadata( + Map indexMetadataMap = remoteClusterStateService.getLatestClusterState( cluster().getClusterName(), getClusterState().metadata().clusterUUID() - ).getIndices(); + ).getMetadata().getIndices(); assertEquals(0, indexMetadataMap.values().stream().findFirst().get().getNumberOfReplicas()); assertEquals(shardCount, indexMetadataMap.values().stream().findFirst().get().getNumberOfShards()); } diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java index e9afd6d36bb87..c61e2ec6e4f6c 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java @@ -30,6 +30,7 @@ import java.nio.file.Path; import java.util.Arrays; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.concurrent.ExecutionException; @@ -85,6 +86,7 @@ public void testFullClusterRestore() throws Exception { // Step - 1 index some data to generate files in remote directory Map indexStats = initialTestSetup(shardCount, replicaCount, dataNodeCount, 1); String prevClusterUUID = 
clusterService().state().metadata().clusterUUID(); + long prevClusterStateVersion = clusterService().state().version(); // Step - 2 Replace all nodes in the cluster with new nodes. This ensures new cluster state doesn't have previous index metadata resetCluster(dataNodeCount, clusterManagerNodeCount); @@ -92,9 +94,17 @@ public void testFullClusterRestore() throws Exception { String newClusterUUID = clusterService().state().metadata().clusterUUID(); assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same"; - // Step - 3 Trigger full cluster restore and validate + // Step - 3 validate cluster state restored + long newClusterStateVersion = clusterService().state().version(); + assert prevClusterStateVersion < newClusterStateVersion : String.format( + Locale.ROOT, + "ClusterState version is not restored. previousClusterVersion: [%s] is greater than current [%s]", + prevClusterStateVersion, + newClusterStateVersion + ); validateMetadata(List.of(INDEX_NAME)); verifyRedIndicesAndTriggerRestore(indexStats, INDEX_NAME, true); + } /** @@ -121,6 +131,7 @@ public void testFullClusterRestoreDoesntFailWithConflictingLocalState() throws E // index some data to generate files in remote directory Map indexStats = initialTestSetup(shardCount, replicaCount, dataNodeCount, 1); String prevClusterUUID = clusterService().state().metadata().clusterUUID(); + long prevClusterStateVersion = clusterService().state().version(); // stop all nodes internalCluster().stopAllNodes(); @@ -156,6 +167,14 @@ public Settings onNodeStopped(String nodeName) { newClusterUUID = clusterService().state().metadata().clusterUUID(); assert !Objects.equals(newClusterUUID, ClusterState.UNKNOWN_UUID) : "cluster restart not successful. cluster uuid is still unknown"; assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same"; + + long newClusterStateVersion = clusterService().state().version(); + assert prevClusterStateVersion < newClusterStateVersion : String.format( + Locale.ROOT, + "ClusterState version is not restored. previousClusterVersion: [%s] is greater than current [%s]", + prevClusterStateVersion, + newClusterStateVersion + ); validateMetadata(List.of(INDEX_NAME)); // start data nodes to trigger index data recovery @@ -180,6 +199,7 @@ public void testFullClusterRestoreMultipleIndices() throws Exception { updateIndexBlock(true, secondIndexName); String prevClusterUUID = clusterService().state().metadata().clusterUUID(); + long prevClusterStateVersion = clusterService().state().version(); // Step - 2 Replace all nodes in the cluster with new nodes. This ensures new cluster state doesn't have previous index metadata resetCluster(dataNodeCount, clusterManagerNodeCount); @@ -187,7 +207,14 @@ public void testFullClusterRestoreMultipleIndices() throws Exception { String newClusterUUID = clusterService().state().metadata().clusterUUID(); assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same"; - // Step - 3 Trigger full cluster restore + // Step - 3 validate cluster state restored + long newClusterStateVersion = clusterService().state().version(); + assert prevClusterStateVersion < newClusterStateVersion : String.format( + Locale.ROOT, + "ClusterState version is not restored. 
previousClusterVersion: [%s] is greater than current [%s]", + prevClusterStateVersion, + newClusterStateVersion + ); validateMetadata(List.of(INDEX_NAME, secondIndexName)); verifyRedIndicesAndTriggerRestore(indexStats, INDEX_NAME, false); verifyRedIndicesAndTriggerRestore(indexStats2, secondIndexName, false); @@ -239,6 +266,7 @@ public void testRemoteStateFullRestart() throws Exception { Map indexStats = initialTestSetup(shardCount, replicaCount, dataNodeCount, clusterManagerNodeCount); String prevClusterUUID = clusterService().state().metadata().clusterUUID(); + long prevClusterStateVersion = clusterService().state().version(); // Delete index metadata file in remote try { Files.move( @@ -257,6 +285,14 @@ public void testRemoteStateFullRestart() throws Exception { ensureGreen(INDEX_NAME); String newClusterUUID = clusterService().state().metadata().clusterUUID(); assert Objects.equals(newClusterUUID, prevClusterUUID) : "Full restart not successful. cluster uuid has changed"; + + long newClusterStateVersion = clusterService().state().version(); + assert prevClusterStateVersion < newClusterStateVersion : String.format( + Locale.ROOT, + "ClusterState version is not restored. previousClusterVersion: [%s] is greater than current [%s]", + prevClusterStateVersion, + newClusterStateVersion + ); validateCurrentMetadata(); verifyRedIndicesAndTriggerRestore(indexStats, INDEX_NAME, true); } @@ -309,6 +345,7 @@ public void testFullClusterRestoreGlobalMetadata() throws Exception { // Step - 1 index some data to generate files in remote directory Map indexStats = initialTestSetup(shardCount, replicaCount, dataNodeCount, 1); String prevClusterUUID = clusterService().state().metadata().clusterUUID(); + long prevClusterStateVersion = clusterService().state().version(); // Create global metadata - register a custom repo Path repoPath = registerCustomRepository(); @@ -328,8 +365,16 @@ public void testFullClusterRestoreGlobalMetadata() throws Exception { String newClusterUUID = clusterService().state().metadata().clusterUUID(); assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same"; - // Step - 3 Trigger full cluster restore and validate - // validateCurrentMetadata(); + // Step - 3 validate cluster state restored + long newClusterStateVersion = clusterService().state().version(); + assert prevClusterStateVersion < newClusterStateVersion : String.format( + Locale.ROOT, + "ClusterState version is not restored. 
previousClusterVersion: [%s] is greater than current [%s]", + prevClusterStateVersion, + newClusterStateVersion + ); + + validateCurrentMetadata(); assertEquals(Integer.valueOf(34), SETTING_CLUSTER_MAX_SHARDS_PER_NODE.get(clusterService().state().metadata().settings())); assertEquals(true, SETTING_READ_ONLY_SETTING.get(clusterService().state().metadata().settings())); assertTrue(clusterService().state().blocks().hasGlobalBlock(CLUSTER_READ_ONLY_BLOCK)); diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index b3309b1fd8a63..205ae12cf6214 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -767,16 +767,16 @@ private IndexMetadata getIndexMetadata(String clusterName, String clusterUUID, U } /** - * Fetch latest metadata from remote cluster state including global metadata and index metadata + * Fetch latest ClusterState from remote, including global metadata, index metadata and cluster state version * * @param clusterUUID uuid of cluster state to refer to in remote * @param clusterName name of the cluster * @return {@link IndexMetadata} */ - public Metadata getLatestMetadata(String clusterName, String clusterUUID) { + public ClusterState getLatestClusterState(String clusterName, String clusterUUID) { start(); Optional clusterMetadataManifest = getLatestClusterMetadataManifest(clusterName, clusterUUID); - if (!clusterMetadataManifest.isPresent()) { + if (clusterMetadataManifest.isEmpty()) { throw new IllegalStateException( String.format(Locale.ROOT, "Latest cluster metadata manifest is not present for the provided clusterUUID: %s", clusterUUID) ); @@ -790,7 +790,10 @@ public Metadata getLatestMetadata(String clusterName, String clusterUUID) { Map indexMetadataMap = new HashMap<>(); indices.values().forEach(indexMetadata -> { indexMetadataMap.put(indexMetadata.getIndex().getName(), indexMetadata); }); - return Metadata.builder(globalMetadata).indices(indexMetadataMap).build(); + return ClusterState.builder(ClusterState.EMPTY_STATE) + .version(clusterMetadataManifest.get().getStateVersion()) + .metadata(Metadata.builder(globalMetadata).indices(indexMetadataMap).build()) + .build(); } private Metadata getGlobalMetadata(String clusterName, String clusterUUID, ClusterMetadataManifest clusterMetadataManifest) { diff --git a/server/src/main/java/org/opensearch/index/recovery/RemoteStoreRestoreService.java b/server/src/main/java/org/opensearch/index/recovery/RemoteStoreRestoreService.java index aebd7d2ea201a..23bb4cea17a20 100644 --- a/server/src/main/java/org/opensearch/index/recovery/RemoteStoreRestoreService.java +++ b/server/src/main/java/org/opensearch/index/recovery/RemoteStoreRestoreService.java @@ -138,7 +138,7 @@ public RemoteRestoreResult restore( String[] indexNames ) { Map> indexMetadataMap = new HashMap<>(); - Metadata remoteMetadata = null; + ClusterState remoteState = null; boolean metadataFromRemoteStore = (restoreClusterUUID == null || restoreClusterUUID.isEmpty() || restoreClusterUUID.isBlank()) == false; @@ -150,8 +150,8 @@ public RemoteRestoreResult restore( throw new IllegalArgumentException("clusterUUID to restore from should be different from current cluster UUID"); } logger.info("Restoring cluster state from remote store from cluster UUID : [{}]", restoreClusterUUID); - remoteMetadata = 
remoteClusterStateService.getLatestMetadata(currentState.getClusterName().value(), restoreClusterUUID); - remoteMetadata.getIndices().values().forEach(indexMetadata -> { + remoteState = remoteClusterStateService.getLatestClusterState(currentState.getClusterName().value(), restoreClusterUUID); + remoteState.getMetadata().getIndices().values().forEach(indexMetadata -> { indexMetadataMap.put(indexMetadata.getIndex().getName(), new Tuple<>(true, indexMetadata)); }); } catch (Exception e) { @@ -177,7 +177,7 @@ public RemoteRestoreResult restore( } } } - return executeRestore(currentState, indexMetadataMap, restoreAllShards, remoteMetadata); + return executeRestore(currentState, indexMetadataMap, restoreAllShards, remoteState); } /** @@ -191,7 +191,7 @@ private RemoteRestoreResult executeRestore( ClusterState currentState, Map> indexMetadataMap, boolean restoreAllShards, - Metadata remoteMetadata + ClusterState remoteState ) { final String restoreUUID = UUIDs.randomBase64UUID(); List indicesToBeRestored = new ArrayList<>(); @@ -241,8 +241,11 @@ private RemoteRestoreResult executeRestore( totalShards += updatedIndexMetadata.getNumberOfShards(); } - if (remoteMetadata != null) { - restoreGlobalMetadata(mdBuilder, remoteMetadata); + if (remoteState != null) { + restoreGlobalMetadata(mdBuilder, remoteState.getMetadata()); + // Restore ClusterState version + logger.info("Restoring ClusterState with Remote State version [{}]", remoteState.version()); + builder.version(remoteState.version()); } RestoreInfo restoreInfo = new RestoreInfo("remote_store", indicesToBeRestored, totalShards, totalShards); diff --git a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java index 586618bd1ecff..4efd1b8a62970 100644 --- a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java +++ b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java @@ -665,7 +665,8 @@ public void testReadLatestMetadataManifestSuccessButNoIndexMetadata() throws IOE remoteClusterStateService.start(); assertEquals( - remoteClusterStateService.getLatestMetadata(clusterState.getClusterName().value(), clusterState.metadata().clusterUUID()) + remoteClusterStateService.getLatestClusterState(clusterState.getClusterName().value(), clusterState.metadata().clusterUUID()) + .getMetadata() .getIndices() .size(), 0 @@ -694,8 +695,10 @@ public void testReadLatestMetadataManifestSuccessButIndexMetadataFetchIOExceptio remoteClusterStateService.start(); Exception e = assertThrows( IllegalStateException.class, - () -> remoteClusterStateService.getLatestMetadata(clusterState.getClusterName().value(), clusterState.metadata().clusterUUID()) - .getIndices() + () -> remoteClusterStateService.getLatestClusterState( + clusterState.getClusterName().value(), + clusterState.metadata().clusterUUID() + ).getMetadata().getIndices() ); assertEquals(e.getMessage(), "Error while downloading IndexMetadata - " + uploadedIndexMetadata.getUploadedFilename()); } @@ -740,10 +743,11 @@ public void testReadGlobalMetadata() throws IOException { final ClusterState clusterState = generateClusterStateWithGlobalMetadata().nodes(nodesWithLocalNodeClusterManager()).build(); remoteClusterStateService.start(); + long prevClusterStateVersion = 13L; final ClusterMetadataManifest expectedManifest = ClusterMetadataManifest.builder() .indices(List.of()) .clusterTerm(1L) - .stateVersion(1L) + 
.stateVersion(prevClusterStateVersion) .stateUUID("state-uuid") .clusterUUID("cluster-uuid") .codecVersion(MANIFEST_CURRENT_CODEC_VERSION) @@ -756,12 +760,20 @@ public void testReadGlobalMetadata() throws IOException { Metadata expactedMetadata = Metadata.builder().persistentSettings(Settings.builder().put("readonly", true).build()).build(); mockBlobContainerForGlobalMetadata(mockBlobStoreObjects(), expectedManifest, expactedMetadata); - Metadata metadata = remoteClusterStateService.getLatestMetadata( + ClusterState newClusterState = remoteClusterStateService.getLatestClusterState( clusterState.getClusterName().value(), clusterState.metadata().clusterUUID() ); - assertTrue(Metadata.isGlobalStateEquals(metadata, expactedMetadata)); + assertTrue(Metadata.isGlobalStateEquals(newClusterState.getMetadata(), expactedMetadata)); + + long newClusterStateVersion = newClusterState.getVersion(); + assert prevClusterStateVersion == newClusterStateVersion : String.format( + Locale.ROOT, + "ClusterState version is not restored. previousClusterVersion: [%s] is not equal to current [%s]", + prevClusterStateVersion, + newClusterStateVersion + ); } public void testReadGlobalMetadataIOException() throws IOException { @@ -793,7 +805,10 @@ public void testReadGlobalMetadataIOException() throws IOException { remoteClusterStateService.start(); Exception e = assertThrows( IllegalStateException.class, - () -> remoteClusterStateService.getLatestMetadata(clusterState.getClusterName().value(), clusterState.metadata().clusterUUID()) + () -> remoteClusterStateService.getLatestClusterState( + clusterState.getClusterName().value(), + clusterState.metadata().clusterUUID() + ) ); assertEquals(e.getMessage(), "Error while downloading Global Metadata - " + globalIndexMetadataName); } @@ -824,16 +839,15 @@ public void testReadLatestIndexMetadataSuccess() throws IOException { .nodeId("nodeA") .opensearchVersion(VersionUtils.randomOpenSearchVersion(random())) .previousClusterUUID("prev-cluster-uuid") - .globalMetadataFileName("global-metadata-file") .codecVersion(ClusterMetadataManifest.CODEC_V0) .build(); mockBlobContainer(mockBlobStoreObjects(), expectedManifest, Map.of(index.getUUID(), indexMetadata)); - Map indexMetadataMap = remoteClusterStateService.getLatestMetadata( + Map indexMetadataMap = remoteClusterStateService.getLatestClusterState( clusterState.getClusterName().value(), clusterState.metadata().clusterUUID() - ).getIndices(); + ).getMetadata().getIndices(); assertEquals(indexMetadataMap.size(), 1); assertEquals(indexMetadataMap.get(index.getName()).getIndex().getName(), index.getName()); From 84be8c9207cf1153b2eb8dfaf77cf737959781cc Mon Sep 17 00:00:00 2001 From: Dhwanil Patel Date: Mon, 30 Oct 2023 12:55:01 +0530 Subject: [PATCH 29/33] Use async write for manifest file and use latch for timeout (#10968) * Use async write for manifest file and use latch for timeout Signed-off-by: Dhwanil Patel --- .../common/settings/ClusterSettings.java | 1 + .../remote/RemoteClusterStateService.java | 91 +++++++++++++------ .../RemoteClusterStateServiceTests.java | 84 ++++++++++++++--- 3 files changed, 137 insertions(+), 39 deletions(-) diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index c2c6effc3336f..3a1fff21db366 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -684,6 +684,7 @@ public void 
apply(Settings value, Settings current, Settings previous) { RemoteClusterStateService.REMOTE_CLUSTER_STATE_ENABLED_SETTING, RemoteClusterStateService.INDEX_METADATA_UPLOAD_TIMEOUT_SETTING, RemoteClusterStateService.GLOBAL_METADATA_UPLOAD_TIMEOUT_SETTING, + RemoteClusterStateService.METADATA_MANIFEST_UPLOAD_TIMEOUT_SETTING, RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING, IndicesService.CLUSTER_REMOTE_TRANSLOG_BUFFER_INTERVAL_SETTING, IndicesService.CLUSTER_REMOTE_INDEX_RESTRICT_ASYNC_DURABILITY_SETTING, diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index 205ae12cf6214..c892b475d71da 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -87,6 +87,8 @@ public class RemoteClusterStateService implements Closeable { public static final TimeValue GLOBAL_METADATA_UPLOAD_TIMEOUT_DEFAULT = TimeValue.timeValueMillis(20000); + public static final TimeValue METADATA_MANIFEST_UPLOAD_TIMEOUT_DEFAULT = TimeValue.timeValueMillis(20000); + public static final Setting INDEX_METADATA_UPLOAD_TIMEOUT_SETTING = Setting.timeSetting( "cluster.remote_store.state.index_metadata.upload_timeout", INDEX_METADATA_UPLOAD_TIMEOUT_DEFAULT, @@ -101,6 +103,13 @@ public class RemoteClusterStateService implements Closeable { Setting.Property.NodeScope ); + public static final Setting METADATA_MANIFEST_UPLOAD_TIMEOUT_SETTING = Setting.timeSetting( + "cluster.remote_store.state.metadata_manifest.upload_timeout", + METADATA_MANIFEST_UPLOAD_TIMEOUT_DEFAULT, + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + public static final ChecksumBlobStoreFormat INDEX_METADATA_FORMAT = new ChecksumBlobStoreFormat<>( "index-metadata", METADATA_NAME_FORMAT, @@ -157,6 +166,7 @@ public class RemoteClusterStateService implements Closeable { private volatile TimeValue indexMetadataUploadTimeout; private volatile TimeValue globalMetadataUploadTimeout; + private volatile TimeValue metadataManifestUploadTimeout; private final AtomicBoolean deleteStaleMetadataRunning = new AtomicBoolean(false); private final RemotePersistenceStats remoteStateStats; @@ -190,9 +200,11 @@ public RemoteClusterStateService( this.slowWriteLoggingThreshold = clusterSettings.get(SLOW_WRITE_LOGGING_THRESHOLD); this.indexMetadataUploadTimeout = clusterSettings.get(INDEX_METADATA_UPLOAD_TIMEOUT_SETTING); this.globalMetadataUploadTimeout = clusterSettings.get(GLOBAL_METADATA_UPLOAD_TIMEOUT_SETTING); + this.metadataManifestUploadTimeout = clusterSettings.get(METADATA_MANIFEST_UPLOAD_TIMEOUT_SETTING); clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold); clusterSettings.addSettingsUpdateConsumer(INDEX_METADATA_UPLOAD_TIMEOUT_SETTING, this::setIndexMetadataUploadTimeout); clusterSettings.addSettingsUpdateConsumer(GLOBAL_METADATA_UPLOAD_TIMEOUT_SETTING, this::setGlobalMetadataUploadTimeout); + clusterSettings.addSettingsUpdateConsumer(METADATA_MANIFEST_UPLOAD_TIMEOUT_SETTING, this::setMetadataManifestUploadTimeout); this.remoteStateStats = new RemotePersistenceStats(); } @@ -401,13 +413,13 @@ private String writeGlobalMetadata(ClusterState clusterState) throws IOException try { if (latch.await(getGlobalMetadataUploadTimeout().millis(), TimeUnit.MILLISECONDS) == false) { // TODO: We should add metrics where transfer is timing out. 
[Issue: #10687] - GlobalMetadataTransferException ex = new GlobalMetadataTransferException( + RemoteStateTransferException ex = new RemoteStateTransferException( String.format(Locale.ROOT, "Timed out waiting for transfer of global metadata to complete") ); throw ex; } } catch (InterruptedException ex) { - GlobalMetadataTransferException exception = new GlobalMetadataTransferException( + RemoteStateTransferException exception = new RemoteStateTransferException( String.format(Locale.ROOT, "Timed out waiting for transfer of global metadata to complete - %s"), ex ); @@ -415,7 +427,7 @@ private String writeGlobalMetadata(ClusterState clusterState) throws IOException throw exception; } if (exceptionReference.get() != null) { - throw new GlobalMetadataTransferException(exceptionReference.get().getMessage(), exceptionReference.get()); + throw new RemoteStateTransferException(exceptionReference.get().getMessage(), exceptionReference.get()); } return result.get(); } @@ -440,7 +452,7 @@ private List writeIndexMetadataParallel(ClusterState clus ); result.add(uploadedIndexMetadata); }, ex -> { - assert ex instanceof IndexMetadataTransferException; + assert ex instanceof RemoteStateTransferException; logger.error( () -> new ParameterizedMessage("Exception during transfer of IndexMetadata to Remote {}", ex.getMessage()), ex @@ -457,7 +469,7 @@ private List writeIndexMetadataParallel(ClusterState clus try { if (latch.await(getIndexMetadataUploadTimeout().millis(), TimeUnit.MILLISECONDS) == false) { - IndexMetadataTransferException ex = new IndexMetadataTransferException( + RemoteStateTransferException ex = new RemoteStateTransferException( String.format( Locale.ROOT, "Timed out waiting for transfer of index metadata to complete - %s", @@ -469,7 +481,7 @@ private List writeIndexMetadataParallel(ClusterState clus } } catch (InterruptedException ex) { exceptionList.forEach(ex::addSuppressed); - IndexMetadataTransferException exception = new IndexMetadataTransferException( + RemoteStateTransferException exception = new RemoteStateTransferException( String.format( Locale.ROOT, "Timed out waiting for transfer of index metadata to complete - %s", @@ -481,7 +493,7 @@ private List writeIndexMetadataParallel(ClusterState clus throw exception; } if (exceptionList.size() > 0) { - IndexMetadataTransferException exception = new IndexMetadataTransferException( + RemoteStateTransferException exception = new RemoteStateTransferException( String.format( Locale.ROOT, "Exception during transfer of IndexMetadata to Remote %s", @@ -520,7 +532,7 @@ private void writeIndexMetadataAsync( indexMetadataContainer.path().buildAsString() + indexMetadataFilename ) ), - ex -> latchedActionListener.onFailure(new IndexMetadataTransferException(indexMetadata.getIndex().toString(), ex)) + ex -> latchedActionListener.onFailure(new RemoteStateTransferException(indexMetadata.getIndex().toString(), ex)) ); INDEX_METADATA_FORMAT.writeAsyncWithUrgentPriority( @@ -601,14 +613,45 @@ private ClusterMetadataManifest uploadManifest( private void writeMetadataManifest(String clusterName, String clusterUUID, ClusterMetadataManifest uploadManifest, String fileName) throws IOException { + AtomicReference result = new AtomicReference(); + AtomicReference exceptionReference = new AtomicReference(); + final BlobContainer metadataManifestContainer = manifestContainer(clusterName, clusterUUID); - CLUSTER_METADATA_MANIFEST_FORMAT.write( + + // latch to wait until upload is not finished + CountDownLatch latch = new CountDownLatch(1); + + LatchedActionListener 
completionListener = new LatchedActionListener<>(ActionListener.wrap(resp -> { + logger.trace(String.format(Locale.ROOT, "Manifest file uploaded successfully.")); + }, ex -> { exceptionReference.set(ex); }), latch); + + CLUSTER_METADATA_MANIFEST_FORMAT.writeAsyncWithUrgentPriority( uploadManifest, metadataManifestContainer, fileName, blobStoreRepository.getCompressor(), + completionListener, FORMAT_PARAMS ); + + try { + if (latch.await(getMetadataManifestUploadTimeout().millis(), TimeUnit.MILLISECONDS) == false) { + RemoteStateTransferException ex = new RemoteStateTransferException( + String.format(Locale.ROOT, "Timed out waiting for transfer of manifest file to complete") + ); + throw ex; + } + } catch (InterruptedException ex) { + RemoteStateTransferException exception = new RemoteStateTransferException( + String.format(Locale.ROOT, "Timed out waiting for transfer of manifest file to complete - %s"), + ex + ); + Thread.currentThread().interrupt(); + throw exception; + } + if (exceptionReference.get() != null) { + throw new RemoteStateTransferException(exceptionReference.get().getMessage(), exceptionReference.get()); + } logger.debug( "Metadata manifest file [{}] written during [{}] phase. ", fileName, @@ -668,6 +711,10 @@ private void setGlobalMetadataUploadTimeout(TimeValue newGlobalMetadataUploadTim this.globalMetadataUploadTimeout = newGlobalMetadataUploadTimeout; } + private void setMetadataManifestUploadTimeout(TimeValue newMetadataManifestUploadTimeout) { + this.metadataManifestUploadTimeout = newMetadataManifestUploadTimeout; + } + public TimeValue getIndexMetadataUploadTimeout() { return this.indexMetadataUploadTimeout; } @@ -676,6 +723,10 @@ public TimeValue getGlobalMetadataUploadTimeout() { return this.globalMetadataUploadTimeout; } + public TimeValue getMetadataManifestUploadTimeout() { + return this.metadataManifestUploadTimeout; + } + static String getManifestFileName(long term, long version, boolean committed) { // 123456789012_test-cluster/cluster-state/dsgYj10Nkso7/manifest/manifest______C/P____ return String.join( @@ -1088,29 +1139,15 @@ public void writeMetadataFailed() { } /** - * Exception for IndexMetadata transfer failures to remote - */ - static class IndexMetadataTransferException extends RuntimeException { - - public IndexMetadataTransferException(String errorDesc) { - super(errorDesc); - } - - public IndexMetadataTransferException(String errorDesc, Throwable cause) { - super(errorDesc, cause); - } - } - - /** - * Exception for GlobalMetadata transfer failures to remote + * Exception for Remote state transfer. 
*/ - static class GlobalMetadataTransferException extends RuntimeException { + static class RemoteStateTransferException extends RuntimeException { - public GlobalMetadataTransferException(String errorDesc) { + public RemoteStateTransferException(String errorDesc) { super(errorDesc); } - public GlobalMetadataTransferException(String errorDesc, Throwable cause) { + public RemoteStateTransferException(String errorDesc, Throwable cause) { super(errorDesc, cause); } } diff --git a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java index 4efd1b8a62970..65477051cdb30 100644 --- a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java +++ b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java @@ -68,6 +68,7 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Supplier; import java.util.stream.Stream; @@ -230,10 +231,17 @@ public void testWriteFullMetadataInParallelSuccess() throws IOException { ArgumentCaptor> actionListenerArgumentCaptor = ArgumentCaptor.forClass(ActionListener.class); ArgumentCaptor writeContextArgumentCaptor = ArgumentCaptor.forClass(WriteContext.class); - + AtomicReference capturedWriteContext = new AtomicReference<>(); doAnswer((i) -> { actionListenerArgumentCaptor.getValue().onResponse(null); return null; + }).doAnswer((i) -> { + actionListenerArgumentCaptor.getValue().onResponse(null); + capturedWriteContext.set(writeContextArgumentCaptor.getValue()); + return null; + }).doAnswer((i) -> { + actionListenerArgumentCaptor.getValue().onResponse(null); + return null; }).when(container).asyncBlobUpload(writeContextArgumentCaptor.capture(), actionListenerArgumentCaptor.capture()); remoteClusterStateService.start(); @@ -262,27 +270,30 @@ public void testWriteFullMetadataInParallelSuccess() throws IOException { assertThat(manifest.getStateUUID(), is(expectedManifest.getStateUUID())); assertThat(manifest.getPreviousClusterUUID(), is(expectedManifest.getPreviousClusterUUID())); - assertEquals(actionListenerArgumentCaptor.getAllValues().size(), 2); - assertEquals(writeContextArgumentCaptor.getAllValues().size(), 2); + assertEquals(actionListenerArgumentCaptor.getAllValues().size(), 3); + assertEquals(writeContextArgumentCaptor.getAllValues().size(), 3); - WriteContext capturedWriteContext = writeContextArgumentCaptor.getValue(); - byte[] writtenBytes = capturedWriteContext.getStreamProvider(Integer.MAX_VALUE).provideStream(0).getInputStream().readAllBytes(); + byte[] writtenBytes = capturedWriteContext.get() + .getStreamProvider(Integer.MAX_VALUE) + .provideStream(0) + .getInputStream() + .readAllBytes(); IndexMetadata writtenIndexMetadata = RemoteClusterStateService.INDEX_METADATA_FORMAT.deserialize( - capturedWriteContext.getFileName(), + capturedWriteContext.get().getFileName(), blobStoreRepository.getNamedXContentRegistry(), new BytesArray(writtenBytes) ); - assertEquals(capturedWriteContext.getWritePriority(), WritePriority.URGENT); + assertEquals(capturedWriteContext.get().getWritePriority(), WritePriority.URGENT); assertEquals(writtenIndexMetadata.getNumberOfShards(), 1); assertEquals(writtenIndexMetadata.getNumberOfReplicas(), 0); assertEquals(writtenIndexMetadata.getIndex().getName(), 
"test-index"); assertEquals(writtenIndexMetadata.getIndex().getUUID(), "index-uuid"); long expectedChecksum = RemoteTransferContainer.checksumOfChecksum(new ByteArrayIndexInput("metadata-filename", writtenBytes), 8); - if (capturedWriteContext.doRemoteDataIntegrityCheck()) { - assertEquals(capturedWriteContext.getExpectedChecksum().longValue(), expectedChecksum); + if (capturedWriteContext.get().doRemoteDataIntegrityCheck()) { + assertEquals(capturedWriteContext.get().getExpectedChecksum().longValue(), expectedChecksum); } else { - assertEquals(capturedWriteContext.getExpectedChecksum(), null); + assertEquals(capturedWriteContext.get().getExpectedChecksum(), null); } } @@ -306,11 +317,44 @@ public void run() { remoteClusterStateService.start(); assertThrows( - RemoteClusterStateService.GlobalMetadataTransferException.class, + RemoteClusterStateService.RemoteStateTransferException.class, () -> remoteClusterStateService.writeFullMetadata(clusterState, randomAlphaOfLength(10)) ); } + public void testTimeoutWhileWritingManifestFile() throws IOException { + // verify update metadata manifest upload timeout + int metadataManifestUploadTimeout = 2; + Settings newSettings = Settings.builder() + .put("cluster.remote_store.state.metadata_manifest.upload_timeout", metadataManifestUploadTimeout + "s") + .build(); + clusterSettings.applySettings(newSettings); + + final ClusterState clusterState = generateClusterStateWithOneIndex().nodes(nodesWithLocalNodeClusterManager()).build(); + AsyncMultiStreamBlobContainer container = (AsyncMultiStreamBlobContainer) mockBlobStoreObjects(AsyncMultiStreamBlobContainer.class); + + ArgumentCaptor> actionListenerArgumentCaptor = ArgumentCaptor.forClass(ActionListener.class); + + doAnswer((i) -> { // For Global Metadata + actionListenerArgumentCaptor.getValue().onResponse(null); + return null; + }).doAnswer((i) -> { // For Index Metadata + actionListenerArgumentCaptor.getValue().onResponse(null); + return null; + }).doAnswer((i) -> { + // For Manifest file perform No Op, so latch in code will timeout + return null; + }).when(container).asyncBlobUpload(any(WriteContext.class), actionListenerArgumentCaptor.capture()); + + remoteClusterStateService.start(); + try { + remoteClusterStateService.writeFullMetadata(clusterState, randomAlphaOfLength(10)); + } catch (Exception e) { + assertTrue(e instanceof RemoteClusterStateService.RemoteStateTransferException); + assertTrue(e.getMessage().contains("Timed out waiting for transfer of manifest file to complete")); + } + } + public void testWriteFullMetadataInParallelFailureForIndexMetadata() throws IOException { final ClusterState clusterState = generateClusterStateWithOneIndex().nodes(nodesWithLocalNodeClusterManager()).build(); AsyncMultiStreamBlobContainer container = (AsyncMultiStreamBlobContainer) mockBlobStoreObjects(AsyncMultiStreamBlobContainer.class); @@ -327,7 +371,7 @@ public void testWriteFullMetadataInParallelFailureForIndexMetadata() throws IOEx remoteClusterStateService.start(); assertThrows( - RemoteClusterStateService.IndexMetadataTransferException.class, + RemoteClusterStateService.RemoteStateTransferException.class, () -> remoteClusterStateService.writeFullMetadata(clusterState, randomAlphaOfLength(10)) ); assertEquals(0, remoteClusterStateService.getStats().getSuccessCount()); @@ -1142,6 +1186,22 @@ public void testIndexMetadataUploadWaitTimeSetting() { assertEquals(indexMetadataUploadTimeout, remoteClusterStateService.getIndexMetadataUploadTimeout().seconds()); } + public void 
testMetadataManifestUploadWaitTimeSetting() { + // verify default value + assertEquals( + RemoteClusterStateService.METADATA_MANIFEST_UPLOAD_TIMEOUT_DEFAULT, + remoteClusterStateService.getMetadataManifestUploadTimeout() + ); + + // verify update metadata manifest upload timeout + int metadataManifestUploadTimeout = randomIntBetween(1, 10); + Settings newSettings = Settings.builder() + .put("cluster.remote_store.state.metadata_manifest.upload_timeout", metadataManifestUploadTimeout + "s") + .build(); + clusterSettings.applySettings(newSettings); + assertEquals(metadataManifestUploadTimeout, remoteClusterStateService.getMetadataManifestUploadTimeout().seconds()); + } + public void testGlobalMetadataUploadWaitTimeSetting() { // verify default value assertEquals( From 4efa6d7a8dfdaba93571785315eeeb24956917f5 Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Mon, 30 Oct 2023 14:58:59 +0530 Subject: [PATCH 30/33] Read the same medata file that is locked during restore of shallow snapshot (#10979) Signed-off-by: Sachin Kale --- .../remotestore/RemoteRestoreSnapshotIT.java | 67 +++++++++++++++++++ .../opensearch/index/shard/IndexShard.java | 3 +- .../opensearch/index/shard/StoreRecovery.java | 5 ++ .../store/RemoteSegmentStoreDirectory.java | 6 +- .../RemoteStoreMetadataLockManager.java | 16 +++++ 5 files changed, 93 insertions(+), 4 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java index ad78c503a4a19..21ce4be9981fb 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteRestoreSnapshotIT.java @@ -589,4 +589,71 @@ public void testRestoreShallowSnapshotRepository() throws ExecutionException, In assertDocsPresentInIndex(client, restoredIndexName1, numDocsInIndex1 + 2); } + public void testRestoreShallowSnapshotIndexAfterSnapshot() throws ExecutionException, InterruptedException { + String indexName1 = "testindex1"; + String snapshotRepoName = "test-restore-snapshot-repo"; + String remoteStoreRepoNameUpdated = "test-rs-repo-updated" + TEST_REMOTE_STORE_REPO_SUFFIX; + String snapshotName1 = "test-restore-snapshot1"; + Path absolutePath1 = randomRepoPath().toAbsolutePath(); + Path absolutePath2 = randomRepoPath().toAbsolutePath(); + String[] pathTokens = absolutePath1.toString().split("/"); + String basePath = pathTokens[pathTokens.length - 1]; + Arrays.copyOf(pathTokens, pathTokens.length - 1); + Path location = PathUtils.get(String.join("/", pathTokens)); + pathTokens = absolutePath2.toString().split("/"); + String basePath2 = pathTokens[pathTokens.length - 1]; + Arrays.copyOf(pathTokens, pathTokens.length - 1); + Path location2 = PathUtils.get(String.join("/", pathTokens)); + logger.info("Path 1 [{}]", absolutePath1); + logger.info("Path 2 [{}]", absolutePath2); + String restoredIndexName1 = indexName1 + "-restored"; + + createRepository(snapshotRepoName, "fs", getRepositorySettings(location, basePath, true)); + + Client client = client(); + Settings indexSettings = Settings.builder() + .put(super.indexSettings()) + .put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .build(); + createIndex(indexName1, indexSettings); + + int numDocsInIndex1 = randomIntBetween(2, 5); + indexDocuments(client, 
indexName1, numDocsInIndex1); + + ensureGreen(indexName1); + + logger.info("--> snapshot"); + SnapshotInfo snapshotInfo1 = createSnapshot(snapshotRepoName, snapshotName1, new ArrayList<>(List.of(indexName1))); + assertThat(snapshotInfo1.successfulShards(), greaterThan(0)); + assertThat(snapshotInfo1.successfulShards(), equalTo(snapshotInfo1.totalShards())); + assertThat(snapshotInfo1.state(), equalTo(SnapshotState.SUCCESS)); + + int extraNumDocsInIndex1 = randomIntBetween(20, 50); + indexDocuments(client, indexName1, extraNumDocsInIndex1); + refresh(indexName1); + + client().admin().indices().close(Requests.closeIndexRequest(indexName1)).get(); + createRepository(remoteStoreRepoNameUpdated, "fs", remoteRepoPath); + RestoreSnapshotResponse restoreSnapshotResponse2 = client.admin() + .cluster() + .prepareRestoreSnapshot(snapshotRepoName, snapshotName1) + .setWaitForCompletion(true) + .setIndices(indexName1) + .setRenamePattern(indexName1) + .setRenameReplacement(restoredIndexName1) + .setSourceRemoteStoreRepository(remoteStoreRepoNameUpdated) + .get(); + + assertTrue(restoreSnapshotResponse2.getRestoreInfo().failedShards() == 0); + ensureGreen(restoredIndexName1); + assertDocsPresentInIndex(client, restoredIndexName1, numDocsInIndex1); + + // indexing some new docs and validating + indexDocuments(client, restoredIndexName1, numDocsInIndex1, numDocsInIndex1 + 2); + ensureGreen(restoredIndexName1); + assertDocsPresentInIndex(client, restoredIndexName1, numDocsInIndex1 + 2); + } + } diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 32396f1a3df2e..cf42c6749fc79 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -4908,8 +4908,7 @@ public void syncSegmentsFromGivenRemoteSegmentStore( remoteStore.incRef(); } Map uploadedSegments = sourceRemoteDirectory - .initializeToSpecificCommit(primaryTerm, commitGeneration) - .getMetadata(); + .getSegmentsUploadedToRemoteStore(); final Directory storeDirectory = store.directory(); store.incRef(); diff --git a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java index e823401e5ef7e..5b1940bb1d9a5 100644 --- a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java +++ b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java @@ -410,6 +410,11 @@ void recoverFromSnapshotAndRemoteStore( indexUUID, shardId ); + sourceRemoteDirectory.initializeToSpecificCommit( + primaryTerm, + commitGeneration, + recoverySource.snapshot().getSnapshotId().getUUID() + ); indexShard.syncSegmentsFromGivenRemoteSegmentStore(true, sourceRemoteDirectory, primaryTerm, commitGeneration); final Store store = indexShard.store(); if (indexShard.indexSettings.isRemoteTranslogStoreEnabled() == false) { diff --git a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java index be1f2341236ab..988d52202f975 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java @@ -34,6 +34,7 @@ import org.opensearch.index.store.lockmanager.FileLockInfo; import org.opensearch.index.store.lockmanager.RemoteStoreCommitLevelLockManager; import 
org.opensearch.index.store.lockmanager.RemoteStoreLockManager; +import org.opensearch.index.store.lockmanager.RemoteStoreMetadataLockManager; import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadata; import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadataHandler; import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; @@ -160,8 +161,9 @@ public RemoteSegmentMetadata init() throws IOException { * * @throws IOException if there were any failures in reading the metadata file */ - public RemoteSegmentMetadata initializeToSpecificCommit(long primaryTerm, long commitGeneration) throws IOException { - String metadataFile = getMetadataFileForCommit(primaryTerm, commitGeneration); + public RemoteSegmentMetadata initializeToSpecificCommit(long primaryTerm, long commitGeneration, String acquirerId) throws IOException { + String metadataFilePrefix = MetadataFilenameUtils.getMetadataFilePrefixForCommit(primaryTerm, commitGeneration); + String metadataFile = ((RemoteStoreMetadataLockManager) mdLockManager).fetchLock(metadataFilePrefix, acquirerId); RemoteSegmentMetadata remoteSegmentMetadata = readMetadataFile(metadataFile); if (remoteSegmentMetadata != null) { this.segmentsUploadedToRemoteStore = new ConcurrentHashMap<>(remoteSegmentMetadata.getMetadata()); diff --git a/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreMetadataLockManager.java b/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreMetadataLockManager.java index fd7906729e314..756905d02229a 100644 --- a/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreMetadataLockManager.java +++ b/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreMetadataLockManager.java @@ -14,10 +14,13 @@ import org.apache.lucene.store.IndexOutput; import org.opensearch.index.store.RemoteBufferedOutputDirectory; +import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.NoSuchFileException; import java.util.Collection; +import java.util.List; import java.util.Objects; +import java.util.stream.Collectors; /** * A Class that implements Remote Store Lock Manager by creating lock files for the remote store files that needs to @@ -70,6 +73,19 @@ public void release(LockInfo lockInfo) throws IOException { } } + public String fetchLock(String filenamePrefix, String acquirerId) throws IOException { + Collection lockFiles = lockDirectory.listFilesByPrefix(filenamePrefix); + List lockFilesForAcquirer = lockFiles.stream() + .filter(lockFile -> acquirerId.equals(FileLockInfo.LockFileUtils.getAcquirerIdFromLock(lockFile))) + .map(FileLockInfo.LockFileUtils::getFileToLockNameFromLock) + .collect(Collectors.toList()); + if (lockFilesForAcquirer.size() == 0) { + throw new FileNotFoundException("No lock file found for prefix: " + filenamePrefix + " and acquirerId: " + acquirerId); + } + assert lockFilesForAcquirer.size() == 1; + return lockFilesForAcquirer.get(0); + } + /** * Checks whether a given file have any lock on it or not. * @param lockInfo File Lock Info instance for which we need to check if lock is acquired. 
From 0d7d1e9db7ace8f6e90ab98ade88d719efaf37e1 Mon Sep 17 00:00:00 2001 From: Andriy Redko Date: Mon, 30 Oct 2023 09:15:17 -0400 Subject: [PATCH 31/33] Update bundled JDK to JDK-21.0.1 (#10576) Signed-off-by: Andriy Redko --- .../java/org/opensearch/gradle/test/DistroTestPlugin.java | 4 ++-- buildSrc/version.properties | 4 +--- distribution/src/config/jvm.options | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/buildSrc/src/main/java/org/opensearch/gradle/test/DistroTestPlugin.java b/buildSrc/src/main/java/org/opensearch/gradle/test/DistroTestPlugin.java index a420c8b63b02c..1ad7e056b6ae6 100644 --- a/buildSrc/src/main/java/org/opensearch/gradle/test/DistroTestPlugin.java +++ b/buildSrc/src/main/java/org/opensearch/gradle/test/DistroTestPlugin.java @@ -77,9 +77,9 @@ import java.util.stream.Stream; public class DistroTestPlugin implements Plugin { - private static final String SYSTEM_JDK_VERSION = "11.0.20+8"; + private static final String SYSTEM_JDK_VERSION = "17.0.9+9"; private static final String SYSTEM_JDK_VENDOR = "adoptium"; - private static final String GRADLE_JDK_VERSION = "17.0.8+7"; + private static final String GRADLE_JDK_VERSION = "17.0.9+9"; private static final String GRADLE_JDK_VENDOR = "adoptium"; // all distributions used by distro tests. this is temporary until tests are per distribution diff --git a/buildSrc/version.properties b/buildSrc/version.properties index 96d398c35851d..0d98cba35448f 100644 --- a/buildSrc/version.properties +++ b/buildSrc/version.properties @@ -2,9 +2,7 @@ opensearch = 3.0.0 lucene = 9.8.0 bundled_jdk_vendor = adoptium -bundled_jdk = 20.0.2+9 -# See please https://github.com/adoptium/temurin-build/issues/3371 -bundled_jdk_linux_ppc64le = 20+36 +bundled_jdk = 21.0.1+12 # optional dependencies spatial4j = 0.7 diff --git a/distribution/src/config/jvm.options b/distribution/src/config/jvm.options index 952110c6c0289..1a0abcbaf9c88 100644 --- a/distribution/src/config/jvm.options +++ b/distribution/src/config/jvm.options @@ -81,7 +81,7 @@ ${error.file} # JDK 20+ Incubating Vector Module for SIMD optimizations; # disabling may reduce performance on vector optimized lucene -20:--add-modules=jdk.incubator.vector +20-:--add-modules=jdk.incubator.vector # HDFS ForkJoinPool.common() support by SecurityManager -Djava.util.concurrent.ForkJoinPool.common.threadFactory=org.opensearch.secure_sm.SecuredForkJoinWorkerThreadFactory From 7c917c5628d8c33a7316bd30e7b8c5c70419f8a1 Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Mon, 30 Oct 2023 19:06:51 +0530 Subject: [PATCH 32/33] Log more info with max seq no on upload to and download from remote translog (#10973) * Log more info with max seq no on upload to and download from remote translog Signed-off-by: Sachin Kale * Change log level to debug Signed-off-by: Sachin Kale * Spotless fixes Signed-off-by: Sachin Kale * Fix UTs Signed-off-by: Sachin Kale --------- Signed-off-by: Sachin Kale Co-authored-by: Sachin Kale --- .../index/translog/RemoteFsTranslog.java | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java b/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java index a305a774f5854..65d16e213cad1 100644 --- a/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java +++ b/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java @@ -103,6 +103,7 @@ public RemoteFsTranslog( try { download(translogTransferManager, location, logger); Checkpoint 
checkpoint = readCheckpoint(location); + logger.info("Downloaded data from remote translog till maxSeqNo = {}", checkpoint.maxSeqNo); this.readers.addAll(recoverFromFiles(checkpoint)); if (readers.isEmpty()) { String errorMsg = String.format(Locale.ROOT, "%s at least one reader must be recovered", shardId); @@ -266,9 +267,13 @@ public void rollGeneration() throws IOException { } private boolean prepareAndUpload(Long primaryTerm, Long generation) throws IOException { + long maxSeqNo = -1; try (Releasable ignored = writeLock.acquire()) { if (generation == null || generation == current.getGeneration()) { try { + if (closed.get() == false) { + maxSeqNo = getMaxSeqNo(); + } final TranslogReader reader = current.closeIntoReader(); readers.add(reader); copyCheckpointTo(location.resolve(getCommitCheckpointFileName(current.getGeneration()))); @@ -300,17 +305,17 @@ private boolean prepareAndUpload(Long primaryTerm, Long generation) throws IOExc // is not updated in remote translog except in primary to primary recovery. if (generation == null) { if (closed.get() == false) { - return upload(primaryTerm, current.getGeneration() - 1); + return upload(primaryTerm, current.getGeneration() - 1, maxSeqNo); } else { - return upload(primaryTerm, current.getGeneration()); + return upload(primaryTerm, current.getGeneration(), maxSeqNo); } } else { - return upload(primaryTerm, generation); + return upload(primaryTerm, generation, maxSeqNo); } } } - private boolean upload(Long primaryTerm, Long generation) throws IOException { + private boolean upload(long primaryTerm, long generation, long maxSeqNo) throws IOException { // During primary relocation (primary-primary peer recovery), both the old and the new primary have engine // created with the RemoteFsTranslog. Both primaries are equipped to upload the translogs. The primary mode check // below ensures that the real primary only is uploading. 
Before the primary mode is set as true for the new @@ -334,7 +339,7 @@ private boolean upload(Long primaryTerm, Long generation) throws IOException { ) { return translogTransferManager.transferSnapshot( transferSnapshotProvider, - new RemoteFsTranslogTransferListener(generation, primaryTerm) + new RemoteFsTranslogTransferListener(generation, primaryTerm, maxSeqNo) ); } @@ -522,23 +527,31 @@ private class RemoteFsTranslogTransferListener implements TranslogTransferListen /** * Generation for the translog */ - private final Long generation; + private final long generation; /** * Primary Term for the translog */ - private final Long primaryTerm; + private final long primaryTerm; + + private final long maxSeqNo; - RemoteFsTranslogTransferListener(Long generation, Long primaryTerm) { + RemoteFsTranslogTransferListener(long generation, long primaryTerm, long maxSeqNo) { this.generation = generation; this.primaryTerm = primaryTerm; + this.maxSeqNo = maxSeqNo; } @Override public void onUploadComplete(TransferSnapshot transferSnapshot) throws IOException { maxRemoteTranslogGenerationUploaded = generation; minRemoteGenReferenced = getMinFileGeneration(); - logger.trace("uploaded translog for {} {} ", primaryTerm, generation); + logger.debug( + "Successfully uploaded translog for primary term = {}, generation = {}, maxSeqNo = {}", + primaryTerm, + generation, + maxSeqNo + ); } @Override From 448635f77855108afedfe6f2e5c07a2f6c37746c Mon Sep 17 00:00:00 2001 From: Chaitanya Gohel <104654647+gashutos@users.noreply.github.com> Date: Mon, 30 Oct 2023 20:51:16 +0530 Subject: [PATCH 33/33] Disable sort optimization for HALF_FLOAT (#10999) Signed-off-by: Chaitanya Gohel --- CHANGELOG.md | 1 + .../org/opensearch/index/fielddata/IndexNumericFieldData.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 020fb5bda8b8b..c18ff830f84cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [BUG] Fix java.lang.SecurityException in repository-gcs plugin ([#10642](https://github.com/opensearch-project/OpenSearch/pull/10642)) - Add telemetry tracer/metric enable flag and integ test. ([#10395](https://github.com/opensearch-project/OpenSearch/pull/10395)) - Add instrumentation for indexing in transport bulk action and transport shard bulk action. ([#10273](https://github.com/opensearch-project/OpenSearch/pull/10273)) +- [BUG] Disable sort optimization for HALF_FLOAT ([#10999](https://github.com/opensearch-project/OpenSearch/pull/10999)) ### Deprecated diff --git a/server/src/main/java/org/opensearch/index/fielddata/IndexNumericFieldData.java b/server/src/main/java/org/opensearch/index/fielddata/IndexNumericFieldData.java index b4e90b8ab570a..6fc074fe0de95 100644 --- a/server/src/main/java/org/opensearch/index/fielddata/IndexNumericFieldData.java +++ b/server/src/main/java/org/opensearch/index/fielddata/IndexNumericFieldData.java @@ -242,7 +242,7 @@ private XFieldComparatorSource comparatorSource( assert !targetNumericType.isFloatingPoint(); source = new IntValuesComparatorSource(this, missingValue, sortMode, nested); } - if (targetNumericType != getNumericType()) { + if (targetNumericType != getNumericType() || getNumericType() == NumericType.HALF_FLOAT) { source.disableSkipping(); // disable skipping logic for cast of sort field } return source;