diff --git a/cmd/argocd-application-controller/commands/argocd_application_controller.go b/cmd/argocd-application-controller/commands/argocd_application_controller.go index ddf6a978a42aa..75e3979ba86e3 100644 --- a/cmd/argocd-application-controller/commands/argocd_application_controller.go +++ b/cmd/argocd-application-controller/commands/argocd_application_controller.go @@ -62,6 +62,7 @@ func NewCommand() *cobra.Command { selfHealBackoffTimeoutSeconds int selfHealBackoffFactor int selfHealBackoffCapSeconds int + syncTimeout int statusProcessors int operationProcessors int glogLevel int @@ -189,6 +190,7 @@ func NewCommand() *cobra.Command { time.Duration(appResyncJitter)*time.Second, time.Duration(selfHealTimeoutSeconds)*time.Second, selfHealBackoff, + time.Duration(syncTimeout)*time.Second, time.Duration(repoErrorGracePeriod)*time.Second, metricsPort, metricsCacheExpiration, @@ -256,6 +258,7 @@ func NewCommand() *cobra.Command { command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts") command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts") command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts") + command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0).") command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.") command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server") command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server") diff --git a/controller/appcontroller.go b/controller/appcontroller.go index a5f342763405e..16b9bc9753090 100644 --- a/controller/appcontroller.go +++ b/controller/appcontroller.go @@ -131,6 +131,7 @@ type ApplicationController struct { statusRefreshJitter time.Duration selfHealTimeout time.Duration selfHealBackOff *wait.Backoff + syncTimeout time.Duration db db.ArgoDB settingsMgr *settings_util.SettingsManager refreshRequestedApps map[string]CompareWith @@ -161,6 +162,7 @@ func NewApplicationController( appResyncJitter time.Duration, selfHealTimeout time.Duration, selfHealBackoff *wait.Backoff, + syncTimeout time.Duration, repoErrorGracePeriod time.Duration, metricsPort int, metricsCacheExpiration time.Duration, @@ -202,6 +204,7 @@ func NewApplicationController( settingsMgr: settingsMgr, selfHealTimeout: selfHealTimeout, selfHealBackOff: selfHealBackoff, + syncTimeout: syncTimeout, clusterSharding: clusterSharding, projByNameCache: sync.Map{}, applicationNamespaces: applicationNamespaces, @@ -1373,12 +1376,21 @@ func (ctrl *ApplicationController) processRequestedAppOperation(app *appv1.Appli // Get rid of sync results and null out previous operation completion time state.SyncResult = nil } + } else if ctrl.syncTimeout != time.Duration(0) && time.Now().After(state.StartedAt.Add(ctrl.syncTimeout)) && !terminating { + state.Phase = synccommon.OperationTerminating + state.Message = "operation is terminating due to timeout" + ctrl.setOperationState(app, state) + logCtx.Infof("Terminating in-progress operation due to timeout. Started at: %v, timeout: %v", state.StartedAt, ctrl.syncTimeout) } else { logCtx.Infof("Resuming in-progress operation. phase: %s, message: %s", state.Phase, state.Message) } } else { state = &appv1.OperationState{Phase: synccommon.OperationRunning, Operation: *app.Operation, StartedAt: metav1.Now()} ctrl.setOperationState(app, state) + if ctrl.syncTimeout != time.Duration(0) { + // Schedule a check during which the timeout would be checked. + ctrl.appOperationQueue.AddAfter(ctrl.toAppKey(app.QualifiedName()), ctrl.syncTimeout) + } logCtx.Infof("Initialized new operation: %v", *app.Operation) } ts.AddCheckpoint("initial_operation_stage_ms") diff --git a/controller/appcontroller_test.go b/controller/appcontroller_test.go index 00aec780a5f4b..a515f36583edc 100644 --- a/controller/appcontroller_test.go +++ b/controller/appcontroller_test.go @@ -167,6 +167,7 @@ func newFakeControllerWithResync(data *fakeData, appResyncPeriod time.Duration, time.Second, time.Minute, nil, + 0, time.Second*10, common.DefaultPortArgoCDMetrics, data.metricsCacheExpiration, @@ -2481,3 +2482,54 @@ func TestSelfHealExponentialBackoff(t *testing.T) { }) } } + +func TestSyncTimeout(t *testing.T) { + testCases := []struct { + delta time.Duration + expectedPhase synccommon.OperationPhase + expectedMessage string + }{{ + delta: 2 * time.Minute, + expectedPhase: synccommon.OperationFailed, + expectedMessage: "Operation terminated", + }, { + delta: 30 * time.Second, + expectedPhase: synccommon.OperationSucceeded, + expectedMessage: "successfully synced (no more tasks)", + }} + for i := range testCases { + tc := testCases[i] + t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { + app := newFakeApp() + app.Spec.Project = "default" + app.Operation = &v1alpha1.Operation{ + Sync: &v1alpha1.SyncOperation{ + Revision: "HEAD", + }, + } + ctrl := newFakeController(&fakeData{ + apps: []runtime.Object{app, &defaultProj}, + manifestResponses: []*apiclient.ManifestResponse{{ + Manifests: []string{}, + }}, + }, nil) + + ctrl.syncTimeout = time.Minute + app.Status.OperationState = &v1alpha1.OperationState{ + Operation: v1alpha1.Operation{ + Sync: &v1alpha1.SyncOperation{ + Revision: "HEAD", + }, + }, + Phase: synccommon.OperationRunning, + StartedAt: metav1.NewTime(time.Now().Add(-tc.delta)), + } + ctrl.processRequestedAppOperation(app) + + app, err := ctrl.applicationClientset.ArgoprojV1alpha1().Applications(app.ObjectMeta.Namespace).Get(context.Background(), app.ObjectMeta.Name, metav1.GetOptions{}) + require.NoError(t, err) + require.Equal(t, tc.expectedPhase, app.Status.OperationState.Phase) + require.Equal(t, tc.expectedMessage, app.Status.OperationState.Message) + }) + } +} diff --git a/docs/operator-manual/argocd-cmd-params-cm.yaml b/docs/operator-manual/argocd-cmd-params-cm.yaml index 7abbdb3e37a17..2ff8cd31b9ae2 100644 --- a/docs/operator-manual/argocd-cmd-params-cm.yaml +++ b/docs/operator-manual/argocd-cmd-params-cm.yaml @@ -51,6 +51,8 @@ data: controller.self.heal.timeout.seconds: "2" controller.self.heal.backoff.factor: "3" controller.self.heal.backoff.cap.seconds: "300" + # Specifies a sync timeout for applications. "0" means no timeout (default "0") + controller.sync.timeout.seconds: "0" # Cache expiration for app state (default 1h0m0s) controller.app.state.cache.expiration: "1h0m0s" diff --git a/docs/operator-manual/server-commands/argocd-application-controller.md b/docs/operator-manual/server-commands/argocd-application-controller.md index 07fd9e545c8d4..0fc41c034d449 100644 --- a/docs/operator-manual/server-commands/argocd-application-controller.md +++ b/docs/operator-manual/server-commands/argocd-application-controller.md @@ -77,6 +77,7 @@ argocd-application-controller [flags] --server-side-diff-enabled Feature flag to enable ServerSide diff. Default ("false") --sharding-method string Enables choice of sharding method. Supported sharding methods are : [legacy, round-robin, consistent-hashing] (default "legacy") --status-processors int Number of application status processors (default 20) + --sync-timeout int Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0). --tls-server-name string If provided, this name will be used to validate server certificate. If this is not provided, hostname used to contact the server is used. --token string Bearer token for authentication to the API server --user string The name of the kubeconfig user to use diff --git a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml index 3398ece8ef8d3..77ef1c3a3fd6b 100644 --- a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml +++ b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml @@ -115,6 +115,12 @@ spec: name: argocd-cmd-params-cm key: controller.self.heal.backoff.cap.seconds optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.sync.timeout.seconds + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml index 9e07525a27982..6162dfb2d27fc 100644 --- a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml +++ b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml @@ -118,6 +118,12 @@ spec: name: argocd-cmd-params-cm key: controller.self.heal.backoff.cap.seconds optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.sync.timeout.seconds + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/core-install.yaml b/manifests/core-install.yaml index 00bf5457e81fb..8854c5c25d40f 100644 --- a/manifests/core-install.yaml +++ b/manifests/core-install.yaml @@ -23657,6 +23657,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/ha/install.yaml b/manifests/ha/install.yaml index 39d75b0ac25c5..b0cacfca0a4c8 100644 --- a/manifests/ha/install.yaml +++ b/manifests/ha/install.yaml @@ -25646,6 +25646,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/ha/namespace-install.yaml b/manifests/ha/namespace-install.yaml index 6f4231e7d4944..3e3f954a224f6 100644 --- a/manifests/ha/namespace-install.yaml +++ b/manifests/ha/namespace-install.yaml @@ -2969,6 +2969,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/install.yaml b/manifests/install.yaml index 0cdf784a2031b..007cdb7ba2027 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -24716,6 +24716,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index 73fe685010ae4..b1a099956fd7c 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -2039,6 +2039,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT + valueFrom: + configMapKeyRef: + key: controller.sync.timeout.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT valueFrom: configMapKeyRef: