From 4017da70b850a7aa0e20f996cb42cf02bee3fd8c Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Tue, 3 Oct 2023 23:26:33 +0530 Subject: [PATCH 1/2] Retry on failure to acquire lock on remote metadata Signed-off-by: Sachin Kale --- .../snapshots/SnapshotShardsService.java | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java b/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java index af2f925f89726..d208e323b3012 100644 --- a/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java +++ b/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java @@ -37,6 +37,7 @@ import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.index.IndexCommit; import org.opensearch.Version; +import org.opensearch.action.admin.indices.flush.FlushRequest; import org.opensearch.cluster.ClusterChangedEvent; import org.opensearch.cluster.ClusterStateListener; import org.opensearch.cluster.SnapshotsInProgress; @@ -73,6 +74,7 @@ import org.opensearch.transport.TransportService; import java.io.IOException; +import java.nio.file.NoSuchFileException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -401,18 +403,32 @@ private void snapshot( try { if (remoteStoreIndexShallowCopy && indexShard.indexSettings().isRemoteStoreEnabled()) { long startTime = threadPool.relativeTimeInMillis(); + long primaryTerm = indexShard.getOperationPrimaryTerm(); // we flush first to make sure we get the latest writes snapshotted wrappedSnapshot = indexShard.acquireLastIndexCommitAndRefresh(true); - long primaryTerm = indexShard.getOperationPrimaryTerm(); - final IndexCommit snapshotIndexCommit = wrappedSnapshot.get(); + IndexCommit snapshotIndexCommit = wrappedSnapshot.get(); long commitGeneration = snapshotIndexCommit.getGeneration(); - indexShard.acquireLockOnCommitData(snapshot.getSnapshotId().getUUID(), primaryTerm, commitGeneration); + try { + indexShard.acquireLockOnCommitData(snapshot.getSnapshotId().getUUID(), primaryTerm, commitGeneration); + } catch (NoSuchFileException e) { + wrappedSnapshot.close(); + logger.info( + "Exception while acquiring lock on primaryTerm = {} and generation = {}", + primaryTerm, + commitGeneration + ); + indexShard.flush(new FlushRequest(shardId.getIndexName()).force(true)); + wrappedSnapshot = indexShard.acquireLastIndexCommit(false); + snapshotIndexCommit = wrappedSnapshot.get(); + commitGeneration = snapshotIndexCommit.getGeneration(); + indexShard.acquireLockOnCommitData(snapshot.getSnapshotId().getUUID(), primaryTerm, commitGeneration); + } try { repository.snapshotRemoteStoreIndexShard( indexShard.store(), snapshot.getSnapshotId(), indexId, - wrappedSnapshot.get(), + snapshotIndexCommit, getShardStateId(indexShard, snapshotIndexCommit), snapshotStatus, primaryTerm, From a186d218c2cd30e8f1d4297fe6c3cea158548881 Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Wed, 4 Oct 2023 08:53:44 +0530 Subject: [PATCH 2/2] Change log level Signed-off-by: Sachin Kale --- .../java/org/opensearch/snapshots/SnapshotShardsService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java b/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java index d208e323b3012..1c25d8c71f948 100644 --- a/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java +++ b/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java @@ -412,7 +412,7 @@ private void snapshot( indexShard.acquireLockOnCommitData(snapshot.getSnapshotId().getUUID(), primaryTerm, commitGeneration); } catch (NoSuchFileException e) { wrappedSnapshot.close(); - logger.info( + logger.warn( "Exception while acquiring lock on primaryTerm = {} and generation = {}", primaryTerm, commitGeneration