From 5f88febe56da26943e1de9cea009193023f08b9b Mon Sep 17 00:00:00 2001 From: Pavel Date: Fri, 3 Nov 2023 01:07:50 +0800 Subject: [PATCH] feat: add retry logic for k8s client #7692 (#16154) * add retry logic for k8s client Signed-off-by: Pavel Aborilov * add docs for retry logic and envs to manifests Signed-off-by: Pavel Aborilov --------- Signed-off-by: Pavel Aborilov Signed-off-by: Pavel --- .../operator-manual/argocd-cmd-params-cm.yaml | 10 +++ docs/operator-manual/high_availability.md | 88 +++++++++++++++++++ ...cd-application-controller-statefulset.yaml | 12 +++ .../base/server/argocd-server-deployment.yaml | 12 +++ manifests/core-install.yaml | 12 +++ .../ha/base/redis-ha/chart/upstream.yaml | 52 ++--------- manifests/ha/install.yaml | 24 +++++ manifests/ha/namespace-install.yaml | 24 +++++ manifests/install.yaml | 24 +++++ manifests/namespace-install.yaml | 24 +++++ pkg/apis/application/v1alpha1/types.go | 10 ++- util/http/http.go | 87 +++++++++++++++++- 12 files changed, 327 insertions(+), 52 deletions(-) diff --git a/docs/operator-manual/argocd-cmd-params-cm.yaml b/docs/operator-manual/argocd-cmd-params-cm.yaml index c752f342dcf50..22c6e3762e69a 100644 --- a/docs/operator-manual/argocd-cmd-params-cm.yaml +++ b/docs/operator-manual/argocd-cmd-params-cm.yaml @@ -58,6 +58,12 @@ data: controller.sharding.algorithm: legacy # Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit. controller.kubectl.parallelism.limit: "20" + # The maximum number of retries for each request + controller.k8sclient.retry.max: "0" + # The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold + controller.k8sclient.retry.base.backoff: "100" + # Grace period in seconds for ignoring consecutive errors while communicating with repo server. + controller.repo.error.grace.period.seconds: "180" ## Server properties # Listen on given address for incoming connections (default "0.0.0.0") @@ -75,6 +81,10 @@ data: # Semicolon-separated list of content types allowed on non-GET requests. Set an empty string to allow all. Be aware # that allowing content types besides application/json may make your API more vulnerable to CSRF attacks. server.api.content.types: "application/json" + # The maximum number of retries for each request + server.k8sclient.retry.max: "0" + # The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold + server.k8sclient.retry.base.backoff: "100" # Set the logging format. One of: text|json (default "text") server.log.format: "text" diff --git a/docs/operator-manual/high_availability.md b/docs/operator-manual/high_availability.md index eaa000b5d96d5..ef3964e2761a5 100644 --- a/docs/operator-manual/high_availability.md +++ b/docs/operator-manual/high_availability.md @@ -229,3 +229,91 @@ spec: path: my-application # ... ``` + +## Rate Limiting Application Reconciliations + +To prevent high controller resource usage or sync loops caused either due to misbehaving apps or other environment specific factors, +we can configure rate limits on the workqueues used by the application controller. There are two types of rate limits that can be configured: + + * Global rate limits + * Per item rate limits + +The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`. + +### Global rate limits + + This is enabled by default, it is a simple bucket based rate limiter that limits the number of items that can be queued per second. +This is useful to prevent a large number of apps from being queued at the same time. + +To configure the bucket limiter you can set the following environment variables: + + * `WORKQUEUE_BUCKET_SIZE` - The number of items that can be queued in a single burst. Defaults to 500. + * `WORKQUEUE_BUCKET_QPS` - The number of items that can be queued per second. Defaults to 50. + +### Per item rate limits + + This by default returns a fixed base delay/backoff value but can be configured to return exponential values, read further to understand it's working. +Per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially +if it is queued multiple times in a short period, but the backoff is reset automatically if a configured `cool down` period has elapsed since the last time the item was queued. + +To configure the per item limiter you can set the following environment variables: + + * `WORKQUEUE_FAILURE_COOLDOWN_NS` : The cool down period in nanoseconds, once period has elapsed for an item the backoff is reset. Exponential backoff is disabled if set to 0(default), eg. values : 10 * 10^9 (=10s) + * `WORKQUEUE_BASE_DELAY_NS` : The base delay in nanoseconds, this is the initial backoff used in the exponential backoff formula. Defaults to 1000 (=1μs) + * `WORKQUEUE_MAX_DELAY_NS` : The max delay in nanoseconds, this is the max backoff limit. Defaults to 3 * 10^9 (=3s) + * `WORKQUEUE_BACKOFF_FACTOR` : The backoff factor, this is the factor by which the backoff is increased for each retry. Defaults to 1.5 + +The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued +and `lastRequeueTime` is the time at which the item was last queued: + +- When `WORKQUEUE_FAILURE_COOLDOWN_NS` != 0 : + +``` +backoff = time.Since(lastRequeueTime) >= WORKQUEUE_FAILURE_COOLDOWN_NS ? + WORKQUEUE_BASE_DELAY_NS : + min( + WORKQUEUE_MAX_DELAY_NS, + WORKQUEUE_BASE_DELAY_NS * WORKQUEUE_BACKOFF_FACTOR ^ (numRequeue) + ) +``` + +- When `WORKQUEUE_FAILURE_COOLDOWN_NS` = 0 : + +``` +backoff = WORKQUEUE_BASE_DELAY_NS +``` + +## HTTP Request Retry Strategy + +In scenarios where network instability or transient server errors occur, the retry strategy ensures the robustness of HTTP communication by automatically resending failed requests. It uses a combination of maximum retries and backoff intervals to prevent overwhelming the server or thrashing the network. + +### Configuring Retries + +The retry logic can be fine-tuned with the following environment variables: + +* `ARGOCD_K8SCLIENT_RETRY_MAX` - The maximum number of retries for each request. The request will be dropped after this count is reached. Defaults to 0 (no retries). +* `ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF` - The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold. Defaults to 100ms. + +### Backoff Strategy + +The backoff strategy employed is a simple exponential backoff without jitter. The backoff time increases exponentially with each retry attempt until a maximum backoff duration is reached. + +The formula for calculating the backoff time is: + +``` +backoff = min(retryWaitMax, baseRetryBackoff * (2 ^ retryAttempt)) +``` +Where `retryAttempt` starts at 0 and increments by 1 for each subsequent retry. + +### Maximum Wait Time + +There is a cap on the backoff time to prevent excessive wait times between retries. This cap is defined by: + +`retryWaitMax` - The maximum duration to wait before retrying. This ensures that retries happen within a reasonable timeframe. Defaults to 10 seconds. + +### Non-Retriable Conditions + +Not all HTTP responses are eligible for retries. The following conditions will not trigger a retry: + +* Responses with a status code indicating client errors (4xx) except for 429 Too Many Requests. +* Responses with the status code 501 Not Implemented. diff --git a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml index c1573ec3a3501..53036177b934d 100644 --- a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml +++ b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml @@ -161,6 +161,18 @@ spec: name: argocd-cmd-params-cm key: controller.ignore.normalizer.jq.timeout optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.k8sclient.retry.max + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.k8sclient.retry.base.backoff + optional: true image: quay.io/argoproj/argocd:latest imagePullPolicy: Always name: argocd-application-controller diff --git a/manifests/base/server/argocd-server-deployment.yaml b/manifests/base/server/argocd-server-deployment.yaml index 45b2cdb8df285..20b59f1b77ccb 100644 --- a/manifests/base/server/argocd-server-deployment.yaml +++ b/manifests/base/server/argocd-server-deployment.yaml @@ -233,6 +233,18 @@ spec: name: argocd-cmd-params-cm key: server.api.content.types optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: server.k8sclient.retry.max + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: server.k8sclient.retry.base.backoff + optional: true volumeMounts: - name: ssh-known-hosts mountPath: /app/config/ssh diff --git a/manifests/core-install.yaml b/manifests/core-install.yaml index 181e046be180e..3113115d15114 100644 --- a/manifests/core-install.yaml +++ b/manifests/core-install.yaml @@ -19451,6 +19451,18 @@ spec: key: controller.ignore.normalizer.jq.timeout name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always name: argocd-application-controller diff --git a/manifests/ha/base/redis-ha/chart/upstream.yaml b/manifests/ha/base/redis-ha/chart/upstream.yaml index a94bba0f09062..ca6fb9f737473 100644 --- a/manifests/ha/base/redis-ha/chart/upstream.yaml +++ b/manifests/ha/base/redis-ha/chart/upstream.yaml @@ -1080,13 +1080,7 @@ spec: args: - /readonly/haproxy_init.sh securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault + null volumeMounts: - name: config-volume mountPath: /readonly @@ -1098,13 +1092,7 @@ spec: image: haproxy:2.6.14-alpine imagePullPolicy: IfNotPresent securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault + null livenessProbe: httpGet: path: /healthz @@ -1200,14 +1188,7 @@ spec: args: - /readonly-config/init.sh securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault + null env: - name: SENTINEL_ID_0 value: 3c0d9c0320bb34888c2df5757c718ce6ca992ce6 @@ -1232,14 +1213,7 @@ spec: args: - /data/conf/redis.conf securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault + null livenessProbe: initialDelaySeconds: 30 periodSeconds: 15 @@ -1289,14 +1263,7 @@ spec: args: - /data/conf/sentinel.conf securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault + null livenessProbe: initialDelaySeconds: 30 periodSeconds: 15 @@ -1340,14 +1307,7 @@ spec: args: - /readonly-config/fix-split-brain.sh securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault + null env: - name: SENTINEL_ID_0 value: 3c0d9c0320bb34888c2df5757c718ce6ca992ce6 diff --git a/manifests/ha/install.yaml b/manifests/ha/install.yaml index 021539b1f0d15..535f5e54d0dc0 100644 --- a/manifests/ha/install.yaml +++ b/manifests/ha/install.yaml @@ -20995,6 +20995,18 @@ spec: key: server.api.content.types name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always livenessProbe: @@ -21247,6 +21259,18 @@ spec: key: controller.ignore.normalizer.jq.timeout name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always name: argocd-application-controller diff --git a/manifests/ha/namespace-install.yaml b/manifests/ha/namespace-install.yaml index adde5cc188fd5..f51e5e2bb531f 100644 --- a/manifests/ha/namespace-install.yaml +++ b/manifests/ha/namespace-install.yaml @@ -2507,6 +2507,18 @@ spec: key: server.api.content.types name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always livenessProbe: @@ -2759,6 +2771,18 @@ spec: key: controller.ignore.normalizer.jq.timeout name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always name: argocd-application-controller diff --git a/manifests/install.yaml b/manifests/install.yaml index 2f9e0078141f1..673155d3ac658 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -20050,6 +20050,18 @@ spec: key: server.api.content.types name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always livenessProbe: @@ -20302,6 +20314,18 @@ spec: key: controller.ignore.normalizer.jq.timeout name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always name: argocd-application-controller diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index 74d395205f07a..1751b07523037 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -1562,6 +1562,18 @@ spec: key: server.api.content.types name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always livenessProbe: @@ -1814,6 +1826,18 @@ spec: key: controller.ignore.normalizer.jq.timeout name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.17 imagePullPolicy: Always name: argocd-application-controller diff --git a/pkg/apis/application/v1alpha1/types.go b/pkg/apis/application/v1alpha1/types.go index 2eb5166c7d1ea..e37a12b656c65 100644 --- a/pkg/apis/application/v1alpha1/types.go +++ b/pkg/apis/application/v1alpha1/types.go @@ -35,11 +35,11 @@ import ( "k8s.io/client-go/tools/clientcmd/api" "sigs.k8s.io/yaml" - "github.com/argoproj/argo-cd/v2/util/env" - "github.com/argoproj/argo-cd/v2/common" "github.com/argoproj/argo-cd/v2/util/collections" + "github.com/argoproj/argo-cd/v2/util/env" "github.com/argoproj/argo-cd/v2/util/helm" + utilhttp "github.com/argoproj/argo-cd/v2/util/http" "github.com/argoproj/argo-cd/v2/util/security" ) @@ -2850,6 +2850,12 @@ func SetK8SConfigDefaults(config *rest.Config) error { config.Timeout = K8sServerSideTimeout config.Transport = tr + maxRetries := env.ParseInt64FromEnv(utilhttp.EnvRetryMax, 0, 1, math.MaxInt64) + if maxRetries > 0 { + backoffDurationMS := env.ParseInt64FromEnv(utilhttp.EnvRetryBaseBackoff, 100, 1, math.MaxInt64) + backoffDuration := time.Duration(backoffDurationMS) * time.Millisecond + config.WrapTransport = utilhttp.WithRetry(maxRetries, backoffDuration) + } return nil } diff --git a/util/http/http.go b/util/http/http.go index 919e57a89e296..7c13c71fde223 100644 --- a/util/http/http.go +++ b/util/http/http.go @@ -1,21 +1,32 @@ package http import ( + "bytes" "fmt" + "io" "math" "net/http" "net/http/httputil" "strconv" "strings" + "time" - "github.com/argoproj/argo-cd/v2/util/env" + log "github.com/sirupsen/logrus" + "k8s.io/client-go/transport" "github.com/argoproj/argo-cd/v2/common" - - log "github.com/sirupsen/logrus" + "github.com/argoproj/argo-cd/v2/util/env" ) -const maxCookieLength = 4093 +const ( + maxCookieLength = 4093 + + // limit size of the resp to 512KB + respReadLimit = int64(524288) + retryWaitMax = time.Duration(10) * time.Second + EnvRetryMax = "ARGOCD_K8SCLIENT_RETRY_MAX" + EnvRetryBaseBackoff = "ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF" +) // max number of chunks a cookie can be broken into. To be compatible with // widest range of browsers, you shouldn't create more than 30 cookies per domain @@ -160,3 +171,71 @@ func (rt *TransportWithHeader) RoundTrip(r *http.Request) (*http.Response, error } return rt.RoundTripper.RoundTrip(r) } + +func WithRetry(maxRetries int64, baseRetryBackoff time.Duration) transport.WrapperFunc { + return func(rt http.RoundTripper) http.RoundTripper { + return &retryTransport{ + inner: rt, + maxRetries: maxRetries, + backoff: baseRetryBackoff, + } + } +} + +type retryTransport struct { + inner http.RoundTripper + maxRetries int64 + backoff time.Duration +} + +func isRetriable(resp *http.Response) bool { + if resp == nil { + return false + } + if resp.StatusCode == http.StatusTooManyRequests { + return true + } + if resp.StatusCode == 0 || (resp.StatusCode >= 500 && resp.StatusCode != http.StatusNotImplemented) { + return true + } + return false +} + +func (t *retryTransport) RoundTrip(req *http.Request) (*http.Response, error) { + var resp *http.Response + var err error + backoff := t.backoff + var bodyBytes []byte + if req.Body != nil { + bodyBytes, _ = io.ReadAll(req.Body) + } + for i := 0; i <= int(t.maxRetries); i++ { + req.Body = io.NopCloser(bytes.NewBuffer(bodyBytes)) + resp, err = t.inner.RoundTrip(req) + if i < int(t.maxRetries) && (err != nil || isRetriable(resp)) { + if resp != nil && resp.Body != nil { + drainBody(resp.Body) + } + if backoff > retryWaitMax { + backoff = retryWaitMax + } + select { + case <-time.After(backoff): + case <-req.Context().Done(): + return nil, req.Context().Err() + } + backoff *= 2 + continue + } + break + } + return resp, err +} + +func drainBody(body io.ReadCloser) { + defer body.Close() + _, err := io.Copy(io.Discard, io.LimitReader(body, respReadLimit)) + if err != nil { + log.Warnf("error reading response body: %s", err.Error()) + } +}