From 5f88febe56da26943e1de9cea009193023f08b9b Mon Sep 17 00:00:00 2001
From: Pavel <aborilov@gmail.com>
Date: Fri, 3 Nov 2023 01:07:50 +0800
Subject: [PATCH] feat: add retry logic for k8s client #7692 (#16154)

* add retry logic for k8s client

Signed-off-by: Pavel Aborilov <aborilov@gmail.com>

* add docs for retry logic and envs to manifests

Signed-off-by: Pavel Aborilov <aborilov@gmail.com>

---------

Signed-off-by: Pavel Aborilov <aborilov@gmail.com>
Signed-off-by: Pavel <aborilov@gmail.com>
---
 .../operator-manual/argocd-cmd-params-cm.yaml | 10 +++
 docs/operator-manual/high_availability.md     | 88 +++++++++++++++++++
 ...cd-application-controller-statefulset.yaml | 12 +++
 .../base/server/argocd-server-deployment.yaml | 12 +++
 manifests/core-install.yaml                   | 12 +++
 .../ha/base/redis-ha/chart/upstream.yaml      | 52 ++---------
 manifests/ha/install.yaml                     | 24 +++++
 manifests/ha/namespace-install.yaml           | 24 +++++
 manifests/install.yaml                        | 24 +++++
 manifests/namespace-install.yaml              | 24 +++++
 pkg/apis/application/v1alpha1/types.go        | 10 ++-
 util/http/http.go                             | 87 +++++++++++++++++-
 12 files changed, 327 insertions(+), 52 deletions(-)

diff --git a/docs/operator-manual/argocd-cmd-params-cm.yaml b/docs/operator-manual/argocd-cmd-params-cm.yaml
index c752f342dcf50..22c6e3762e69a 100644
--- a/docs/operator-manual/argocd-cmd-params-cm.yaml
+++ b/docs/operator-manual/argocd-cmd-params-cm.yaml
@@ -58,6 +58,12 @@ data:
   controller.sharding.algorithm: legacy
   # Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.
   controller.kubectl.parallelism.limit: "20"
+  # The maximum number of retries for each request
+  controller.k8sclient.retry.max: "0"
+  # The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold
+  controller.k8sclient.retry.base.backoff: "100"
+  # Grace period in seconds for ignoring consecutive errors while communicating with repo server.
+  controller.repo.error.grace.period.seconds: "180"
 
   ## Server properties
   # Listen on given address for incoming connections (default "0.0.0.0")
@@ -75,6 +81,10 @@ data:
   # Semicolon-separated list of content types allowed on non-GET requests. Set an empty string to allow all. Be aware
   # that allowing content types besides application/json may make your API more vulnerable to CSRF attacks.
   server.api.content.types: "application/json"
+  # The maximum number of retries for each request
+  server.k8sclient.retry.max: "0"
+  # The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold
+  server.k8sclient.retry.base.backoff: "100"
 
   # Set the logging format. One of: text|json (default "text")
   server.log.format: "text"
diff --git a/docs/operator-manual/high_availability.md b/docs/operator-manual/high_availability.md
index eaa000b5d96d5..ef3964e2761a5 100644
--- a/docs/operator-manual/high_availability.md
+++ b/docs/operator-manual/high_availability.md
@@ -229,3 +229,91 @@ spec:
     path: my-application
 # ...
 ```
+
+## Rate Limiting Application Reconciliations
+
+To prevent high controller resource usage or sync loops caused either due to misbehaving apps or other environment specific factors, 
+we can configure rate limits on the workqueues used by the application controller. There are two types of rate limits that can be configured:
+
+  * Global rate limits
+  * Per item rate limits
+
+The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`. 
+
+### Global rate limits
+
+  This is enabled by default, it is a simple bucket based rate limiter that limits the number of items that can be queued per second.
+This is useful to prevent a large number of apps from being queued at the same time. 
+  
+To configure the bucket limiter you can set the following environment variables:
+
+  * `WORKQUEUE_BUCKET_SIZE` - The number of items that can be queued in a single burst. Defaults to 500.
+  * `WORKQUEUE_BUCKET_QPS` - The number of items that can be queued per second. Defaults to 50.
+
+### Per item rate limits 
+
+  This by default returns a fixed base delay/backoff value but can be configured to return exponential values, read further to understand it's working. 
+Per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially 
+if it is queued multiple times in a short period, but the backoff is reset automatically if a configured `cool down` period has elapsed since the last time the item was queued.
+
+To configure the per item limiter you can set the following environment variables:
+
+  * `WORKQUEUE_FAILURE_COOLDOWN_NS` : The cool down period in nanoseconds, once period has elapsed for an item the backoff is reset. Exponential backoff is disabled if set to 0(default), eg. values : 10 * 10^9 (=10s)
+  * `WORKQUEUE_BASE_DELAY_NS` : The base delay in nanoseconds, this is the initial backoff used in the exponential backoff formula. Defaults to 1000 (=1μs)
+  * `WORKQUEUE_MAX_DELAY_NS` : The max delay in nanoseconds, this is the max backoff limit. Defaults to 3 * 10^9 (=3s)
+  * `WORKQUEUE_BACKOFF_FACTOR` : The backoff factor, this is the factor by which the backoff is increased for each retry. Defaults to 1.5
+
+The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued 
+and `lastRequeueTime` is the time at which the item was last queued:
+
+- When `WORKQUEUE_FAILURE_COOLDOWN_NS` != 0 :
+
+```
+backoff = time.Since(lastRequeueTime) >= WORKQUEUE_FAILURE_COOLDOWN_NS ? 
+          WORKQUEUE_BASE_DELAY_NS : 
+          min(
+              WORKQUEUE_MAX_DELAY_NS, 
+              WORKQUEUE_BASE_DELAY_NS * WORKQUEUE_BACKOFF_FACTOR ^ (numRequeue)
+              )
+```
+
+- When `WORKQUEUE_FAILURE_COOLDOWN_NS` = 0 :
+
+```
+backoff = WORKQUEUE_BASE_DELAY_NS
+```
+
+## HTTP Request Retry Strategy
+
+In scenarios where network instability or transient server errors occur, the retry strategy ensures the robustness of HTTP communication by automatically resending failed requests. It uses a combination of maximum retries and backoff intervals to prevent overwhelming the server or thrashing the network.
+
+### Configuring Retries
+
+The retry logic can be fine-tuned with the following environment variables:
+
+* `ARGOCD_K8SCLIENT_RETRY_MAX` - The maximum number of retries for each request. The request will be dropped after this count is reached. Defaults to 0 (no retries).
+* `ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF` - The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold. Defaults to 100ms.
+
+### Backoff Strategy
+
+The backoff strategy employed is a simple exponential backoff without jitter. The backoff time increases exponentially with each retry attempt until a maximum backoff duration is reached.
+
+The formula for calculating the backoff time is:
+
+```
+backoff = min(retryWaitMax, baseRetryBackoff * (2 ^ retryAttempt))
+```
+Where `retryAttempt` starts at 0 and increments by 1 for each subsequent retry.
+
+### Maximum Wait Time
+
+There is a cap on the backoff time to prevent excessive wait times between retries. This cap is defined by:
+
+`retryWaitMax` - The maximum duration to wait before retrying. This ensures that retries happen within a reasonable timeframe. Defaults to 10 seconds.
+
+### Non-Retriable Conditions
+
+Not all HTTP responses are eligible for retries. The following conditions will not trigger a retry:
+
+* Responses with a status code indicating client errors (4xx) except for 429 Too Many Requests.
+* Responses with the status code 501 Not Implemented.
diff --git a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml
index c1573ec3a3501..53036177b934d 100644
--- a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml
+++ b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml
@@ -161,6 +161,18 @@ spec:
                 name: argocd-cmd-params-cm
                 key: controller.ignore.normalizer.jq.timeout
                 optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+              configMapKeyRef:
+                name: argocd-cmd-params-cm
+                key: controller.k8sclient.retry.max
+                optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+              configMapKeyRef:
+                name: argocd-cmd-params-cm
+                key: controller.k8sclient.retry.base.backoff
+                optional: true
         image: quay.io/argoproj/argocd:latest
         imagePullPolicy: Always
         name: argocd-application-controller
diff --git a/manifests/base/server/argocd-server-deployment.yaml b/manifests/base/server/argocd-server-deployment.yaml
index 45b2cdb8df285..20b59f1b77ccb 100644
--- a/manifests/base/server/argocd-server-deployment.yaml
+++ b/manifests/base/server/argocd-server-deployment.yaml
@@ -233,6 +233,18 @@ spec:
               name: argocd-cmd-params-cm
               key: server.api.content.types
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+              configMapKeyRef:
+                name: argocd-cmd-params-cm
+                key: server.k8sclient.retry.max
+                optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+              configMapKeyRef:
+                name: argocd-cmd-params-cm
+                key: server.k8sclient.retry.base.backoff
+                optional: true
         volumeMounts:
         - name: ssh-known-hosts
           mountPath: /app/config/ssh
diff --git a/manifests/core-install.yaml b/manifests/core-install.yaml
index 181e046be180e..3113115d15114 100644
--- a/manifests/core-install.yaml
+++ b/manifests/core-install.yaml
@@ -19451,6 +19451,18 @@ spec:
               key: controller.ignore.normalizer.jq.timeout
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         name: argocd-application-controller
diff --git a/manifests/ha/base/redis-ha/chart/upstream.yaml b/manifests/ha/base/redis-ha/chart/upstream.yaml
index a94bba0f09062..ca6fb9f737473 100644
--- a/manifests/ha/base/redis-ha/chart/upstream.yaml
+++ b/manifests/ha/base/redis-ha/chart/upstream.yaml
@@ -1080,13 +1080,7 @@ spec:
         args:
         - /readonly/haproxy_init.sh
         securityContext: 
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-            - ALL
-          runAsNonRoot: true
-          seccompProfile:
-            type: RuntimeDefault
+          null
         volumeMounts:
         - name: config-volume
           mountPath: /readonly
@@ -1098,13 +1092,7 @@ spec:
         image: haproxy:2.6.14-alpine
         imagePullPolicy: IfNotPresent
         securityContext: 
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-            - ALL
-          runAsNonRoot: true
-          seccompProfile:
-            type: RuntimeDefault
+          null
         livenessProbe:
           httpGet:
             path: /healthz
@@ -1200,14 +1188,7 @@ spec:
         args:
         - /readonly-config/init.sh
         securityContext: 
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-            - ALL
-          runAsNonRoot: true
-          runAsUser: 1000
-          seccompProfile:
-            type: RuntimeDefault        
+          null        
         env:
         - name: SENTINEL_ID_0
           value: 3c0d9c0320bb34888c2df5757c718ce6ca992ce6
@@ -1232,14 +1213,7 @@ spec:
         args:
         - /data/conf/redis.conf
         securityContext: 
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-            - ALL
-          runAsNonRoot: true
-          runAsUser: 1000
-          seccompProfile:
-            type: RuntimeDefault
+          null
         livenessProbe:
           initialDelaySeconds: 30
           periodSeconds: 15
@@ -1289,14 +1263,7 @@ spec:
         args:
           - /data/conf/sentinel.conf
         securityContext: 
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-            - ALL
-          runAsNonRoot: true
-          runAsUser: 1000
-          seccompProfile:
-            type: RuntimeDefault
+          null
         livenessProbe:
           initialDelaySeconds: 30
           periodSeconds: 15
@@ -1340,14 +1307,7 @@ spec:
         args:
           - /readonly-config/fix-split-brain.sh
         securityContext: 
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-            - ALL
-          runAsNonRoot: true
-          runAsUser: 1000
-          seccompProfile:
-            type: RuntimeDefault        
+          null        
         env:
         - name: SENTINEL_ID_0
           value: 3c0d9c0320bb34888c2df5757c718ce6ca992ce6
diff --git a/manifests/ha/install.yaml b/manifests/ha/install.yaml
index 021539b1f0d15..535f5e54d0dc0 100644
--- a/manifests/ha/install.yaml
+++ b/manifests/ha/install.yaml
@@ -20995,6 +20995,18 @@ spec:
               key: server.api.content.types
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         livenessProbe:
@@ -21247,6 +21259,18 @@ spec:
               key: controller.ignore.normalizer.jq.timeout
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         name: argocd-application-controller
diff --git a/manifests/ha/namespace-install.yaml b/manifests/ha/namespace-install.yaml
index adde5cc188fd5..f51e5e2bb531f 100644
--- a/manifests/ha/namespace-install.yaml
+++ b/manifests/ha/namespace-install.yaml
@@ -2507,6 +2507,18 @@ spec:
               key: server.api.content.types
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         livenessProbe:
@@ -2759,6 +2771,18 @@ spec:
               key: controller.ignore.normalizer.jq.timeout
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         name: argocd-application-controller
diff --git a/manifests/install.yaml b/manifests/install.yaml
index 2f9e0078141f1..673155d3ac658 100644
--- a/manifests/install.yaml
+++ b/manifests/install.yaml
@@ -20050,6 +20050,18 @@ spec:
               key: server.api.content.types
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         livenessProbe:
@@ -20302,6 +20314,18 @@ spec:
               key: controller.ignore.normalizer.jq.timeout
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         name: argocd-application-controller
diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml
index 74d395205f07a..1751b07523037 100644
--- a/manifests/namespace-install.yaml
+++ b/manifests/namespace-install.yaml
@@ -1562,6 +1562,18 @@ spec:
               key: server.api.content.types
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: server.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         livenessProbe:
@@ -1814,6 +1826,18 @@ spec:
               key: controller.ignore.normalizer.jq.timeout
               name: argocd-cmd-params-cm
               optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_MAX
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.max
+              name: argocd-cmd-params-cm
+              optional: true
+        - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF
+          valueFrom:
+            configMapKeyRef:
+              key: controller.k8sclient.retry.base.backoff
+              name: argocd-cmd-params-cm
+              optional: true
         image: quay.io/argoproj/argocd:v2.8.17
         imagePullPolicy: Always
         name: argocd-application-controller
diff --git a/pkg/apis/application/v1alpha1/types.go b/pkg/apis/application/v1alpha1/types.go
index 2eb5166c7d1ea..e37a12b656c65 100644
--- a/pkg/apis/application/v1alpha1/types.go
+++ b/pkg/apis/application/v1alpha1/types.go
@@ -35,11 +35,11 @@ import (
 	"k8s.io/client-go/tools/clientcmd/api"
 	"sigs.k8s.io/yaml"
 
-	"github.com/argoproj/argo-cd/v2/util/env"
-
 	"github.com/argoproj/argo-cd/v2/common"
 	"github.com/argoproj/argo-cd/v2/util/collections"
+	"github.com/argoproj/argo-cd/v2/util/env"
 	"github.com/argoproj/argo-cd/v2/util/helm"
+	utilhttp "github.com/argoproj/argo-cd/v2/util/http"
 	"github.com/argoproj/argo-cd/v2/util/security"
 )
 
@@ -2850,6 +2850,12 @@ func SetK8SConfigDefaults(config *rest.Config) error {
 	config.Timeout = K8sServerSideTimeout
 
 	config.Transport = tr
+	maxRetries := env.ParseInt64FromEnv(utilhttp.EnvRetryMax, 0, 1, math.MaxInt64)
+	if maxRetries > 0 {
+		backoffDurationMS := env.ParseInt64FromEnv(utilhttp.EnvRetryBaseBackoff, 100, 1, math.MaxInt64)
+		backoffDuration := time.Duration(backoffDurationMS) * time.Millisecond
+		config.WrapTransport = utilhttp.WithRetry(maxRetries, backoffDuration)
+	}
 	return nil
 }
 
diff --git a/util/http/http.go b/util/http/http.go
index 919e57a89e296..7c13c71fde223 100644
--- a/util/http/http.go
+++ b/util/http/http.go
@@ -1,21 +1,32 @@
 package http
 
 import (
+	"bytes"
 	"fmt"
+	"io"
 	"math"
 	"net/http"
 	"net/http/httputil"
 	"strconv"
 	"strings"
+	"time"
 
-	"github.com/argoproj/argo-cd/v2/util/env"
+	log "github.com/sirupsen/logrus"
+	"k8s.io/client-go/transport"
 
 	"github.com/argoproj/argo-cd/v2/common"
-
-	log "github.com/sirupsen/logrus"
+	"github.com/argoproj/argo-cd/v2/util/env"
 )
 
-const maxCookieLength = 4093
+const (
+	maxCookieLength = 4093
+
+	// limit size of the resp to 512KB
+	respReadLimit       = int64(524288)
+	retryWaitMax        = time.Duration(10) * time.Second
+	EnvRetryMax         = "ARGOCD_K8SCLIENT_RETRY_MAX"
+	EnvRetryBaseBackoff = "ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF"
+)
 
 // max number of chunks a cookie can be broken into. To be compatible with
 // widest range of browsers, you shouldn't create more than 30 cookies per domain
@@ -160,3 +171,71 @@ func (rt *TransportWithHeader) RoundTrip(r *http.Request) (*http.Response, error
 	}
 	return rt.RoundTripper.RoundTrip(r)
 }
+
+func WithRetry(maxRetries int64, baseRetryBackoff time.Duration) transport.WrapperFunc {
+	return func(rt http.RoundTripper) http.RoundTripper {
+		return &retryTransport{
+			inner:      rt,
+			maxRetries: maxRetries,
+			backoff:    baseRetryBackoff,
+		}
+	}
+}
+
+type retryTransport struct {
+	inner      http.RoundTripper
+	maxRetries int64
+	backoff    time.Duration
+}
+
+func isRetriable(resp *http.Response) bool {
+	if resp == nil {
+		return false
+	}
+	if resp.StatusCode == http.StatusTooManyRequests {
+		return true
+	}
+	if resp.StatusCode == 0 || (resp.StatusCode >= 500 && resp.StatusCode != http.StatusNotImplemented) {
+		return true
+	}
+	return false
+}
+
+func (t *retryTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+	var resp *http.Response
+	var err error
+	backoff := t.backoff
+	var bodyBytes []byte
+	if req.Body != nil {
+		bodyBytes, _ = io.ReadAll(req.Body)
+	}
+	for i := 0; i <= int(t.maxRetries); i++ {
+		req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
+		resp, err = t.inner.RoundTrip(req)
+		if i < int(t.maxRetries) && (err != nil || isRetriable(resp)) {
+			if resp != nil && resp.Body != nil {
+				drainBody(resp.Body)
+			}
+			if backoff > retryWaitMax {
+				backoff = retryWaitMax
+			}
+			select {
+			case <-time.After(backoff):
+			case <-req.Context().Done():
+				return nil, req.Context().Err()
+			}
+			backoff *= 2
+			continue
+		}
+		break
+	}
+	return resp, err
+}
+
+func drainBody(body io.ReadCloser) {
+	defer body.Close()
+	_, err := io.Copy(io.Discard, io.LimitReader(body, respReadLimit))
+	if err != nil {
+		log.Warnf("error reading response body: %s", err.Error())
+	}
+}