From 54cef09b09c7a70c68c2f7e977b4468effd70dd7 Mon Sep 17 00:00:00 2001
From: Armin Braun <me@obrown.io>
Date: Wed, 22 Jan 2020 10:33:26 +0100
Subject: [PATCH] Fix Overly Optimistic Request Deduplication (#51270)

On master failover we have to resent all the shard failed messages,
but the transport requests remain the same in the eyes of `equals`.
If the master failover is registered and the requests to the new master
are sent before all the callbacks have executed and the request to the
old master removed from the deduplicator then the requuests to the new
master will incorrectly fail and the snapshot get stuck.

Closes #51253
---
 .../elasticsearch/snapshots/SnapshotShardsService.java    | 3 +++
 .../transport/TransportRequestDeduplicator.java           | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java
index 4b205a55370d5..e0b9a4e48aced 100644
--- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java
+++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java
@@ -357,6 +357,9 @@ private void syncShardStatsOnNewMaster(ClusterChangedEvent event) {
             return;
         }
 
+        // Clear request deduplicator since we need to send all requests that were potentially not handled by the previous
+        // master again
+        remoteFailedRequestDeduplicator.clear();
         for (SnapshotsInProgress.Entry snapshot : snapshotsInProgress.entries()) {
             if (snapshot.state() == State.STARTED || snapshot.state() == State.ABORTED) {
                 Map<ShardId, IndexShardSnapshotStatus> localShards = currentSnapshotShards(snapshot.snapshot());
diff --git a/server/src/main/java/org/elasticsearch/transport/TransportRequestDeduplicator.java b/server/src/main/java/org/elasticsearch/transport/TransportRequestDeduplicator.java
index d929ef34ce2c3..6249975bf3ef6 100644
--- a/server/src/main/java/org/elasticsearch/transport/TransportRequestDeduplicator.java
+++ b/server/src/main/java/org/elasticsearch/transport/TransportRequestDeduplicator.java
@@ -53,6 +53,14 @@ public void executeOnce(T request, ActionListener<Void> listener, BiConsumer<T,
         }
     }
 
+    /**
+     * Remove all tracked requests from this instance so that the first time {@link #executeOnce} is invoked with any request it triggers
+     * an actual request execution. Use this e.g. for requests to master that need to be sent again on master failover.
+     */
+    public void clear() {
+        requests.clear();
+    }
+
     public int size() {
         return requests.size();
     }