From c8a1ec997a3867bab135eb64495c0af8ca89c5e9 Mon Sep 17 00:00:00 2001 From: Remington Breeze Date: Fri, 29 Sep 2023 17:41:36 -0700 Subject: [PATCH] fix: only enable dynamic cluster sharding feature explicitly (#15734) * fix: only enable dynamic cluster sharding feature explicitly Signed-off-by: Remington Breeze --------- Signed-off-by: Remington Breeze --- .../commands/argocd_application_controller.go | 54 ++++----- common/common.go | 2 + .../dynamic-cluster-distribution.md | 2 + .../argocd-application-controller.md | 107 +++++++++--------- 4 files changed, 86 insertions(+), 79 deletions(-) diff --git a/cmd/argocd-application-controller/commands/argocd_application_controller.go b/cmd/argocd-application-controller/commands/argocd_application_controller.go index f65a7210e9d4a..a43174633b02a 100644 --- a/cmd/argocd-application-controller/commands/argocd_application_controller.go +++ b/cmd/argocd-application-controller/commands/argocd_application_controller.go @@ -45,28 +45,29 @@ const ( func NewCommand() *cobra.Command { var ( - clientConfig clientcmd.ClientConfig - appResyncPeriod int64 - appHardResyncPeriod int64 - repoServerAddress string - repoServerTimeoutSeconds int - selfHealTimeoutSeconds int - statusProcessors int - operationProcessors int - glogLevel int - metricsPort int - metricsCacheExpiration time.Duration - metricsAplicationLabels []string - kubectlParallelismLimit int64 - cacheSource func() (*appstatecache.Cache, error) - redisClient *redis.Client - repoServerPlaintext bool - repoServerStrictTLS bool - otlpAddress string - otlpAttrs []string - applicationNamespaces []string - persistResourceHealth bool - shardingAlgorithm string + clientConfig clientcmd.ClientConfig + appResyncPeriod int64 + appHardResyncPeriod int64 + repoServerAddress string + repoServerTimeoutSeconds int + selfHealTimeoutSeconds int + statusProcessors int + operationProcessors int + glogLevel int + metricsPort int + metricsCacheExpiration time.Duration + metricsAplicationLabels 
[]string + kubectlParallelismLimit int64 + cacheSource func() (*appstatecache.Cache, error) + redisClient *redis.Client + repoServerPlaintext bool + repoServerStrictTLS bool + otlpAddress string + otlpAttrs []string + applicationNamespaces []string + persistResourceHealth bool + shardingAlgorithm string + enableDynamicClusterDistribution bool ) var command = cobra.Command{ Use: cliName, @@ -139,7 +140,7 @@ func NewCommand() *cobra.Command { appController.InvalidateProjectsCache() })) kubectl := kubeutil.NewKubectl() - clusterFilter := getClusterFilter(kubeClient, settingsMgr, shardingAlgorithm) + clusterFilter := getClusterFilter(kubeClient, settingsMgr, shardingAlgorithm, enableDynamicClusterDistribution) errors.CheckError(err) appController, err = controller.NewApplicationController( namespace, @@ -204,13 +205,14 @@ func NewCommand() *cobra.Command { command.Flags().StringSliceVar(&applicationNamespaces, "application-namespaces", env.StringsFromEnv("ARGOCD_APPLICATION_NAMESPACES", []string{}, ","), "List of additional namespaces that applications are allowed to be reconciled from") command.Flags().BoolVar(&persistResourceHealth, "persist-resource-health", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH", true), "Enables storing the managed resources health in the Application CRD") command.Flags().StringVar(&shardingAlgorithm, "sharding-method", env.StringFromEnv(common.EnvControllerShardingAlgorithm, common.DefaultShardingAlgorithm), "Enables choice of sharding method. 
Supported sharding methods are : [legacy, round-robin] ") + command.Flags().BoolVar(&enableDynamicClusterDistribution, "dynamic-cluster-distribution-enabled", env.ParseBoolFromEnv(common.EnvEnableDynamicClusterDistribution, false), "Enables dynamic cluster distribution.") cacheSource = appstatecache.AddCacheFlagsToCmd(&command, func(client *redis.Client) { redisClient = client }) return &command } -func getClusterFilter(kubeClient *kubernetes.Clientset, settingsMgr *settings.SettingsManager, shardingAlgorithm string) sharding.ClusterFilterFunction { +func getClusterFilter(kubeClient *kubernetes.Clientset, settingsMgr *settings.SettingsManager, shardingAlgorithm string, enableDynamicClusterDistribution bool) sharding.ClusterFilterFunction { var replicas int shard := env.ParseNumFromEnv(common.EnvControllerShard, -1, -math.MaxInt32, math.MaxInt32) @@ -223,7 +225,7 @@ func getClusterFilter(kubeClient *kubernetes.Clientset, settingsMgr *settings.Se appControllerDeployment = nil } - if appControllerDeployment != nil && appControllerDeployment.Spec.Replicas != nil { + if enableDynamicClusterDistribution && appControllerDeployment != nil && appControllerDeployment.Spec.Replicas != nil { replicas = int(*appControllerDeployment.Spec.Replicas) } else { replicas = env.ParseNumFromEnv(common.EnvControllerReplicas, 0, 0, math.MaxInt32) @@ -233,7 +235,7 @@ func getClusterFilter(kubeClient *kubernetes.Clientset, settingsMgr *settings.Se if replicas > 1 { // check for shard mapping using configmap if application-controller is a deployment // else use existing logic to infer shard from pod name if application-controller is a statefulset - if appControllerDeployment != nil { + if enableDynamicClusterDistribution && appControllerDeployment != nil { var err error // retry 3 times if we find a conflict while updating shard mapping configMap. 
diff --git a/common/common.go b/common/common.go index 59e2d7b8474ab..d7c2d24738b58 100644 --- a/common/common.go +++ b/common/common.go @@ -224,6 +224,8 @@ const ( EnvControllerShard = "ARGOCD_CONTROLLER_SHARD" // EnvControllerShardingAlgorithm is the distribution sharding algorithm to be used: legacy or round-robin EnvControllerShardingAlgorithm = "ARGOCD_CONTROLLER_SHARDING_ALGORITHM" + //EnvEnableDynamicClusterDistribution enables dynamic sharding (ALPHA) + EnvEnableDynamicClusterDistribution = "ARGOCD_ENABLE_DYNAMIC_CLUSTER_DISTRIBUTION" // EnvEnableGRPCTimeHistogramEnv enables gRPC metrics collection EnvEnableGRPCTimeHistogramEnv = "ARGOCD_ENABLE_GRPC_TIME_HISTOGRAM" // EnvGithubAppCredsExpirationDuration controls the caching of Github app credentials. This value is in minutes (default: 60) diff --git a/docs/operator-manual/dynamic-cluster-distribution.md b/docs/operator-manual/dynamic-cluster-distribution.md index b07165ae0219a..a32258c3f2f0a 100644 --- a/docs/operator-manual/dynamic-cluster-distribution.md +++ b/docs/operator-manual/dynamic-cluster-distribution.md @@ -17,6 +17,8 @@ which does not require a restart of the application controller pods. ## Enabling Dynamic Distribution of Clusters +This feature is disabled by default while it is in alpha. To enable it, you must set the environment variable `ARGOCD_ENABLE_DYNAMIC_CLUSTER_DISTRIBUTION` to `true` when running the Application Controller. + In order to utilize the feature, the manifests `manifests/ha/base/controller-deployment/` can be applied as a Kustomize overlay. This overlay sets the StatefulSet replicas to `0` and deploys the application controller as a Deployment. The dynamic distribution code automatically kicks in when the controller is deployed as a Deployment. 
diff --git a/docs/operator-manual/server-commands/argocd-application-controller.md b/docs/operator-manual/server-commands/argocd-application-controller.md index 64957c4a36e1e..21d26b29c572e 100644 --- a/docs/operator-manual/server-commands/argocd-application-controller.md +++ b/docs/operator-manual/server-commands/argocd-application-controller.md @@ -15,58 +15,59 @@ argocd-application-controller [flags] ### Options ``` - --app-hard-resync int Time period in seconds for application hard resync. - --app-resync int Time period in seconds for application resync. (default 180) - --app-state-cache-expiration duration Cache expiration for app state (default 1h0m0s) - --application-namespaces strings List of additional namespaces that applications are allowed to be reconciled from - --as string Username to impersonate for the operation - --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. - --as-uid string UID to impersonate for the operation - --certificate-authority string Path to a cert file for the certificate authority - --client-certificate string Path to a client certificate file for TLS - --client-key string Path to a client key file for TLS - --cluster string The name of the kubeconfig cluster to use - --context string The name of the kubeconfig context to use - --default-cache-expiration duration Cache expiration default (default 24h0m0s) - --gloglevel int Set the glog logging level - -h, --help help for argocd-application-controller - --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure - --kubeconfig string Path to a kube config. Only required if out-of-cluster - --kubectl-parallelism-limit int Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit. (default 20) - --logformat string Set the logging format. 
One of: text|json (default "text") - --loglevel string Set the logging level. One of: debug|info|warn|error (default "info") - --metrics-application-labels strings List of Application labels that will be added to the argocd_application_labels metric - --metrics-cache-expiration duration Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s) - --metrics-port int Start metrics server on given port (default 8082) - -n, --namespace string If present, the namespace scope for this CLI request - --operation-processors int Number of application operation processors (default 10) - --otlp-address string OpenTelemetry collector address to send traces to - --otlp-attrs strings List of OpenTelemetry collector extra attrs when send traces, each attribute is separated by a colon(e.g. key:value) - --password string Password for basic authentication to the API server - --persist-resource-health Enables storing the managed resources health in the Application CRD (default true) - --proxy-url string If provided, this URL will be used to connect via proxy - --redis string Redis server hostname and port (e.g. argocd-redis:6379). - --redis-ca-certificate string Path to Redis server CA certificate (e.g. /etc/certs/redis/ca.crt). If not specified, system trusted CAs will be used for server certificate validation. - --redis-client-certificate string Path to Redis client certificate (e.g. /etc/certs/redis/client.crt). - --redis-client-key string Path to Redis client key (e.g. /etc/certs/redis/client.crt). - --redis-compress string Enable compression for data sent to Redis with the required compression algorithm. (possible values: gzip, none) (default "gzip") - --redis-insecure-skip-tls-verify Skip Redis server certificate validation. - --redis-use-tls Use TLS when connecting to Redis. - --redisdb int Redis database. - --repo-server string Repo server address. 
(default "argocd-repo-server:8081") - --repo-server-plaintext Disable TLS on connections to repo server - --repo-server-strict-tls Whether to use strict validation of the TLS cert presented by the repo server - --repo-server-timeout-seconds int Repo server RPC call timeout seconds. (default 60) - --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") - --self-heal-timeout-seconds int Specifies timeout between application self heal attempts (default 5) - --sentinel stringArray Redis sentinel hostname and port (e.g. argocd-redis-ha-announce-0:6379). - --sentinelmaster string Redis sentinel master group name. (default "master") - --server string The address and port of the Kubernetes API server - --sharding-method string Enables choice of sharding method. Supported sharding methods are : [legacy, round-robin] (default "legacy") - --status-processors int Number of application status processors (default 20) - --tls-server-name string If provided, this name will be used to validate server certificate. If this is not provided, hostname used to contact the server is used. - --token string Bearer token for authentication to the API server - --user string The name of the kubeconfig user to use - --username string Username for basic authentication to the API server + --app-hard-resync int Time period in seconds for application hard resync. + --app-resync int Time period in seconds for application resync. (default 180) + --app-state-cache-expiration duration Cache expiration for app state (default 1h0m0s) + --application-namespaces strings List of additional namespaces that applications are allowed to be reconciled from + --as string Username to impersonate for the operation + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. 
+ --as-uid string UID to impersonate for the operation + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --default-cache-expiration duration Cache expiration default (default 24h0m0s) + --dynamic-cluster-distribution-enabled Enables dynamic cluster distribution. + --gloglevel int Set the glog logging level + -h, --help help for argocd-application-controller + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to a kube config. Only required if out-of-cluster + --kubectl-parallelism-limit int Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit. (default 20) + --logformat string Set the logging format. One of: text|json (default "text") + --loglevel string Set the logging level. One of: debug|info|warn|error (default "info") + --metrics-application-labels strings List of Application labels that will be added to the argocd_application_labels metric + --metrics-cache-expiration duration Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s) + --metrics-port int Start metrics server on given port (default 8082) + -n, --namespace string If present, the namespace scope for this CLI request + --operation-processors int Number of application operation processors (default 10) + --otlp-address string OpenTelemetry collector address to send traces to + --otlp-attrs strings List of OpenTelemetry collector extra attrs when send traces, each attribute is separated by a colon(e.g. 
key:value) + --password string Password for basic authentication to the API server + --persist-resource-health Enables storing the managed resources health in the Application CRD (default true) + --proxy-url string If provided, this URL will be used to connect via proxy + --redis string Redis server hostname and port (e.g. argocd-redis:6379). + --redis-ca-certificate string Path to Redis server CA certificate (e.g. /etc/certs/redis/ca.crt). If not specified, system trusted CAs will be used for server certificate validation. + --redis-client-certificate string Path to Redis client certificate (e.g. /etc/certs/redis/client.crt). + --redis-client-key string Path to Redis client key (e.g. /etc/certs/redis/client.crt). + --redis-compress string Enable compression for data sent to Redis with the required compression algorithm. (possible values: gzip, none) (default "gzip") + --redis-insecure-skip-tls-verify Skip Redis server certificate validation. + --redis-use-tls Use TLS when connecting to Redis. + --redisdb int Redis database. + --repo-server string Repo server address. (default "argocd-repo-server:8081") + --repo-server-plaintext Disable TLS on connections to repo server + --repo-server-strict-tls Whether to use strict validation of the TLS cert presented by the repo server + --repo-server-timeout-seconds int Repo server RPC call timeout seconds. (default 60) + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + --self-heal-timeout-seconds int Specifies timeout between application self heal attempts (default 5) + --sentinel stringArray Redis sentinel hostname and port (e.g. argocd-redis-ha-announce-0:6379). + --sentinelmaster string Redis sentinel master group name. 
(default "master") + --server string The address and port of the Kubernetes API server + --sharding-method string Enables choice of sharding method. Supported sharding methods are : [legacy, round-robin] (default "legacy") + --status-processors int Number of application status processors (default 20) + --tls-server-name string If provided, this name will be used to validate server certificate. If this is not provided, hostname used to contact the server is used. + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use + --username string Username for basic authentication to the API server ```