From 677ff7513598d5622285a31bd1dd7011abd45bf7 Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Wed, 4 Oct 2023 18:47:00 +0530 Subject: [PATCH] Retry on failure to acquire lock on remote metadata (#10341) Signed-off-by: Sachin Kale --- .../snapshots/SnapshotShardsService.java | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java b/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java index af2f925f89726..1c25d8c71f948 100644 --- a/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java +++ b/server/src/main/java/org/opensearch/snapshots/SnapshotShardsService.java @@ -37,6 +37,7 @@ import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.index.IndexCommit; import org.opensearch.Version; +import org.opensearch.action.admin.indices.flush.FlushRequest; import org.opensearch.cluster.ClusterChangedEvent; import org.opensearch.cluster.ClusterStateListener; import org.opensearch.cluster.SnapshotsInProgress; @@ -73,6 +74,7 @@ import org.opensearch.transport.TransportService; import java.io.IOException; +import java.nio.file.NoSuchFileException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -401,18 +403,32 @@ private void snapshot( try { if (remoteStoreIndexShallowCopy && indexShard.indexSettings().isRemoteStoreEnabled()) { long startTime = threadPool.relativeTimeInMillis(); + long primaryTerm = indexShard.getOperationPrimaryTerm(); // we flush first to make sure we get the latest writes snapshotted wrappedSnapshot = indexShard.acquireLastIndexCommitAndRefresh(true); - long primaryTerm = indexShard.getOperationPrimaryTerm(); - final IndexCommit snapshotIndexCommit = wrappedSnapshot.get(); + IndexCommit snapshotIndexCommit = wrappedSnapshot.get(); long commitGeneration = snapshotIndexCommit.getGeneration(); - indexShard.acquireLockOnCommitData(snapshot.getSnapshotId().getUUID(), primaryTerm, commitGeneration); + try { + indexShard.acquireLockOnCommitData(snapshot.getSnapshotId().getUUID(), primaryTerm, commitGeneration); + } catch (NoSuchFileException e) { + wrappedSnapshot.close(); + logger.warn( + "Exception while acquiring lock on primaryTerm = {} and generation = {}", + primaryTerm, + commitGeneration + ); + indexShard.flush(new FlushRequest(shardId.getIndexName()).force(true)); + wrappedSnapshot = indexShard.acquireLastIndexCommit(false); + snapshotIndexCommit = wrappedSnapshot.get(); + commitGeneration = snapshotIndexCommit.getGeneration(); + indexShard.acquireLockOnCommitData(snapshot.getSnapshotId().getUUID(), primaryTerm, commitGeneration); + } try { repository.snapshotRemoteStoreIndexShard( indexShard.store(), snapshot.getSnapshotId(), indexId, - wrappedSnapshot.get(), + snapshotIndexCommit, getShardStateId(indexShard, snapshotIndexCommit), snapshotStatus, primaryTerm,