From ca96116924e1782a5b7c4d83b9901e1f3233ef42 Mon Sep 17 00:00:00 2001 From: Andrii Korotkov Date: Sun, 17 Nov 2024 04:22:25 -0800 Subject: [PATCH] feat: Sync timeouts for applications (#6055) Helps with #6055 Introduces a controller-level configuration for terminating sync after timeout. Signed-off-by: Andrii Korotkov --- .../commands/argocd_application_controller.go | 3 ++ controller/appcontroller.go | 12 +++++ controller/appcontroller_test.go | 52 +++++++++++++++++++ .../operator-manual/argocd-cmd-params-cm.yaml | 2 + .../argocd-application-controller.md | 1 + ...ocd-application-controller-deployment.yaml | 6 +++ ...cd-application-controller-statefulset.yaml | 6 +++ manifests/core-install.yaml | 6 +++ manifests/ha/install.yaml | 6 +++ manifests/ha/namespace-install.yaml | 6 +++ manifests/install.yaml | 6 +++ manifests/namespace-install.yaml | 6 +++ 12 files changed, 112 insertions(+) diff --git a/cmd/argocd-application-controller/commands/argocd_application_controller.go b/cmd/argocd-application-controller/commands/argocd_application_controller.go index 0468e4fd473dc..869efd8ce9a97 100644 --- a/cmd/argocd-application-controller/commands/argocd_application_controller.go +++ b/cmd/argocd-application-controller/commands/argocd_application_controller.go @@ -61,6 +61,7 @@ func NewCommand() *cobra.Command { selfHealBackoffTimeoutSeconds int selfHealBackoffFactor int selfHealBackoffCapSeconds int + syncTimeout int statusProcessors int operationProcessors int glogLevel int @@ -181,6 +182,7 @@ func NewCommand() *cobra.Command { time.Duration(appResyncJitter)*time.Second, time.Duration(selfHealTimeoutSeconds)*time.Second, selfHealBackoff, + time.Duration(syncTimeout)*time.Second, time.Duration(repoErrorGracePeriod)*time.Second, metricsPort, metricsCacheExpiration, @@ -248,6 +250,7 @@ func NewCommand() *cobra.Command { command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts") command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts") command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts") + command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNCTIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0).") command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.") command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server") command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server") diff --git a/controller/appcontroller.go b/controller/appcontroller.go index 5a48f1d41cb09..29cc2884235c8 100644 --- a/controller/appcontroller.go +++ b/controller/appcontroller.go @@ -131,6 +131,7 @@ type ApplicationController struct { statusRefreshJitter time.Duration selfHealTimeout time.Duration selfHealBackOff *wait.Backoff + syncTimeout time.Duration db db.ArgoDB settingsMgr *settings_util.SettingsManager refreshRequestedApps map[string]CompareWith @@ -161,6 +162,7 @@ func NewApplicationController( appResyncJitter time.Duration, selfHealTimeout time.Duration, selfHealBackoff *wait.Backoff, + syncTimeout time.Duration, repoErrorGracePeriod time.Duration, metricsPort int, metricsCacheExpiration time.Duration, @@ -202,6 +204,7 @@ func NewApplicationController( settingsMgr: settingsMgr, selfHealTimeout: selfHealTimeout, selfHealBackOff: selfHealBackoff, + syncTimeout: syncTimeout, clusterSharding: clusterSharding, projByNameCache: sync.Map{}, applicationNamespaces: applicationNamespaces, @@ -1373,12 +1376,21 @@ func (ctrl *ApplicationController) processRequestedAppOperation(app *appv1.Appli // Get rid of sync results and null out previous operation completion time state.SyncResult = nil } + } else if ctrl.syncTimeout != time.Duration(0) && time.Now().After(state.StartedAt.Add(ctrl.syncTimeout)) && !terminating { + state.Phase = synccommon.OperationTerminating + state.Message = "operation is terminating due to timeout" + ctrl.setOperationState(app, state) + logCtx.Infof("Terminating in-progress operation due to timeout. Started at: %v, timeout: %v", state.StartedAt, ctrl.syncTimeout) } else { logCtx.Infof("Resuming in-progress operation. phase: %s, message: %s", state.Phase, state.Message) } } else { state = &appv1.OperationState{Phase: synccommon.OperationRunning, Operation: *app.Operation, StartedAt: metav1.Now()} ctrl.setOperationState(app, state) + if ctrl.syncTimeout != time.Duration(0) { + // Schedule a check during which the timeout would be checked. + ctrl.appOperationQueue.AddAfter(ctrl.toAppKey(app.QualifiedName()), ctrl.syncTimeout) + } logCtx.Infof("Initialized new operation: %v", *app.Operation) } ts.AddCheckpoint("initial_operation_stage_ms") diff --git a/controller/appcontroller_test.go b/controller/appcontroller_test.go index cf2c14c7e0447..00a69a95c7160 100644 --- a/controller/appcontroller_test.go +++ b/controller/appcontroller_test.go @@ -160,6 +160,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController { time.Second, time.Minute, nil, + 0, time.Second*10, common.DefaultPortArgoCDMetrics, data.metricsCacheExpiration, @@ -2256,3 +2257,54 @@ func TestSelfHealExponentialBackoff(t *testing.T) { }) } } + +func TestSyncTimeout(t *testing.T) { + testCases := []struct { + delta time.Duration + expectedPhase synccommon.OperationPhase + expectedMessage string + }{{ + delta: 2 * time.Minute, + expectedPhase: synccommon.OperationFailed, + expectedMessage: "Operation terminated", + }, { + delta: 30 * time.Second, + expectedPhase: synccommon.OperationSucceeded, + expectedMessage: "successfully synced (no more tasks)", + }} + for i := range testCases { + tc := testCases[i] + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + app := newFakeApp() + app.Spec.Project = "default" + app.Operation = &v1alpha1.Operation{ + Sync: &v1alpha1.SyncOperation{ + Revision: "HEAD", + }, + } + ctrl := newFakeController(&fakeData{ + apps: []runtime.Object{app, &defaultProj}, + manifestResponses: []*apiclient.ManifestResponse{{ + Manifests: []string{}, + }}, + }, nil) + + ctrl.syncTimeout = time.Minute + app.Status.OperationState = &v1alpha1.OperationState{ + Operation: v1alpha1.Operation{ + Sync: &v1alpha1.SyncOperation{ + Revision: "HEAD", + }, + }, + Phase: synccommon.OperationRunning, + StartedAt: metav1.NewTime(time.Now().Add(-tc.delta)), + } + ctrl.processRequestedAppOperation(app) + + app, err := ctrl.applicationClientset.ArgoprojV1alpha1().Applications(app.ObjectMeta.Namespace).Get(context.Background(), app.ObjectMeta.Name, metav1.GetOptions{}) + require.NoError(t, err) + require.Equal(t, tc.expectedPhase, app.Status.OperationState.Phase) + require.Equal(t, tc.expectedMessage, app.Status.OperationState.Message) + }) + } +} diff --git a/docs/operator-manual/argocd-cmd-params-cm.yaml b/docs/operator-manual/argocd-cmd-params-cm.yaml index 37aaadd12a4d4..17a1b9e6de941 100644 --- a/docs/operator-manual/argocd-cmd-params-cm.yaml +++ b/docs/operator-manual/argocd-cmd-params-cm.yaml @@ -51,6 +51,8 @@ data: controller.self.heal.timeout.seconds: "2" controller.self.heal.backoff.factor: "3" controller.self.heal.backoff.cap.seconds: "300" + # Specifies a sync timeout for applications. "0" means no timeout (default "0") + controller.sync.timeout.seconds: "0" # Cache expiration for app state (default 1h0m0s) controller.app.state.cache.expiration: "1h0m0s" diff --git a/docs/operator-manual/server-commands/argocd-application-controller.md b/docs/operator-manual/server-commands/argocd-application-controller.md index 07fd9e545c8d4..0fc41c034d449 100644 --- a/docs/operator-manual/server-commands/argocd-application-controller.md +++ b/docs/operator-manual/server-commands/argocd-application-controller.md @@ -77,6 +77,7 @@ argocd-application-controller [flags] --server-side-diff-enabled Feature flag to enable ServerSide diff. Default ("false") --sharding-method string Enables choice of sharding method. Supported sharding methods are : [legacy, round-robin, consistent-hashing] (default "legacy") --status-processors int Number of application status processors (default 20) + --sync-timeout int Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0). --tls-server-name string If provided, this name will be used to validate server certificate. If this is not provided, hostname used to contact the server is used. --token string Bearer token for authentication to the API server --user string The name of the kubeconfig user to use diff --git a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml index 3398ece8ef8d3..77ef1c3a3fd6b 100644 --- a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml +++ b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml @@ -115,6 +115,12 @@ spec: name: argocd-cmd-params-cm key: controller.self.heal.backoff.cap.seconds optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.sync.timeout.seconds + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml index 9e07525a27982..6162dfb2d27fc 100644 --- a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml +++ b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml @@ -118,6 +118,12 @@ spec: name: argocd-cmd-params-cm key: controller.self.heal.backoff.cap.seconds optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.sync.timeout.seconds + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/core-install.yaml b/manifests/core-install.yaml index c5714d95fc1eb..7ed42b69ce5d3 100644 --- a/manifests/core-install.yaml +++ b/manifests/core-install.yaml @@ -23354,6 +23354,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/ha/install.yaml b/manifests/ha/install.yaml index aa71c208ed4cc..b878854133128 100644 --- a/manifests/ha/install.yaml +++ b/manifests/ha/install.yaml @@ -25344,6 +25344,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/ha/namespace-install.yaml b/manifests/ha/namespace-install.yaml index 6f4231e7d4944..3e3f954a224f6 100644 --- a/manifests/ha/namespace-install.yaml +++ b/manifests/ha/namespace-install.yaml @@ -2969,6 +2969,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/install.yaml b/manifests/install.yaml index 70db9f4a37e9f..534d87520963b 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -24414,6 +24414,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index 73fe685010ae4..b1a099956fd7c 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -2039,6 +2039,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: