Skip to content

Commit

Permalink
fix: Application controller can run out of disk space on '/dev/shm' (a…
Browse files Browse the repository at this point in the history
…rgoproj#19201)

Signed-off-by: Jonathan West <[email protected]>
  • Loading branch information
jgwest committed Jul 26, 2024
1 parent 25c2f2f commit bcc5c2d
Show file tree
Hide file tree
Showing 12 changed files with 54 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/argoproj/argo-cd/v2/util/settings"
"github.com/argoproj/argo-cd/v2/util/tls"
"github.com/argoproj/argo-cd/v2/util/trace"
argoio "github.com/argoproj/gitops-engine/pkg/utils/io"
)

const (
Expand Down Expand Up @@ -74,6 +75,7 @@ func NewCommand() *cobra.Command {
enableDynamicClusterDistribution bool
serverSideDiff bool
ignoreNormalizerOpts normalizers.IgnoreNormalizerOpts
manifestGenerationTmpPath string
)
command := cobra.Command{
Use: cliName,
Expand Down Expand Up @@ -172,6 +174,7 @@ func NewCommand() *cobra.Command {
serverSideDiff,
enableDynamicClusterDistribution,
ignoreNormalizerOpts,
manifestGenerationTmpPath,
)
errors.CheckError(err)
cacheutil.CollectMetrics(redisClient, appController.GetMetricsServer())
Expand Down Expand Up @@ -233,6 +236,9 @@ func NewCommand() *cobra.Command {
command.Flags().BoolVar(&enableDynamicClusterDistribution, "dynamic-cluster-distribution-enabled", env.ParseBoolFromEnv(common.EnvEnableDynamicClusterDistribution, false), "Enables dynamic cluster distribution.")
command.Flags().BoolVar(&serverSideDiff, "server-side-diff-enabled", env.ParseBoolFromEnv(common.EnvServerSideDiff, false), "Feature flag to enable ServerSide diff. Default (\"false\")")
command.Flags().DurationVar(&ignoreNormalizerOpts.JQExecutionTimeout, "ignore-normalizer-jq-execution-timeout-seconds", env.ParseDurationFromEnv("ARGOCD_IGNORE_NORMALIZER_JQ_TIMEOUT", 0*time.Second, 0, math.MaxInt64), "Set ignore normalizer JQ execution timeout")

command.Flags().StringVar(&manifestGenerationTmpPath, "manifestGenerationTmpPath", env.StringFromEnv("ARGOCD_APPLICATION_CONTROLLER_MANIFEST_GENERATION_PATH", argoio.TempPathUseDevShmIfAvailable()), "Set the path within application-controller container where Kubernetes manifests will be generated before being passed to kubectl code, default value is \"/dev/shm\"")

cacheSource = appstatecache.AddCacheFlagsToCmd(&command, cacheutil.Options{
OnClientCreated: func(client *redis.Client) {
redisClient = client
Expand Down
3 changes: 2 additions & 1 deletion cmd/argocd/commands/admin/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"github.com/argoproj/argo-cd/v2/util/io"
kubeutil "github.com/argoproj/argo-cd/v2/util/kube"
"github.com/argoproj/argo-cd/v2/util/settings"
argoio "github.com/argoproj/gitops-engine/pkg/utils/io"
)

func NewAppCommand(clientOpts *argocdclient.ClientOptions) *cobra.Command {
Expand Down Expand Up @@ -452,5 +453,5 @@ func reconcileApplications(
}

func newLiveStateCache(argoDB db.ArgoDB, appInformer kubecache.SharedIndexInformer, settingsMgr *settings.SettingsManager, server *metrics.MetricsServer) cache.LiveStateCache {
return cache.NewLiveStateCache(argoDB, appInformer, settingsMgr, kubeutil.NewKubectl(), server, func(managedByApp map[string]bool, ref apiv1.ObjectReference) {}, &sharding.ClusterSharding{}, argo.NewResourceTracking())
return cache.NewLiveStateCache(argoDB, appInformer, settingsMgr, kubeutil.NewKubectl(), argoio.TempPathUseDevShmIfAvailable(), server, func(managedByApp map[string]bool, ref apiv1.ObjectReference) {}, &sharding.ClusterSharding{}, argo.NewResourceTracking())
}
4 changes: 3 additions & 1 deletion controller/appcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ func NewApplicationController(
serverSideDiff bool,
dynamicClusterDistributionEnabled bool,
ignoreNormalizerOpts normalizers.IgnoreNormalizerOpts,
manifestGenerationTmpPath string,
) (*ApplicationController, error) {
log.Infof("appResyncPeriod=%v, appHardResyncPeriod=%v, appResyncJitter=%v", appResyncPeriod, appHardResyncPeriod, appResyncJitter)
db := db.NewDB(namespace, settingsMgr, kubeClientset)
Expand Down Expand Up @@ -289,7 +290,8 @@ func NewApplicationController(
return nil, err
}
}
stateCache := statecache.NewLiveStateCache(db, appInformer, ctrl.settingsMgr, kubectl, ctrl.metricsServer, ctrl.handleObjectUpdated, clusterSharding, argo.NewResourceTracking())

stateCache := statecache.NewLiveStateCache(db, appInformer, ctrl.settingsMgr, kubectl, manifestGenerationTmpPath, ctrl.metricsServer, ctrl.handleObjectUpdated, clusterSharding, argo.NewResourceTracking())
appStateManager := NewAppStateManager(db, applicationClientset, repoClientset, namespace, kubectl, ctrl.settingsMgr, stateCache, projInformer, ctrl.metricsServer, argoCache, ctrl.statusRefreshTimeout, argo.NewResourceTracking(), persistResourceHealth, repoErrorGracePeriod, serverSideDiff, ignoreNormalizerOpts)
ctrl.appInformer = appInformer
ctrl.appLister = appLister
Expand Down
2 changes: 2 additions & 0 deletions controller/appcontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import (
cacheutil "github.com/argoproj/argo-cd/v2/util/cache"
appstatecache "github.com/argoproj/argo-cd/v2/util/cache/appstate"
"github.com/argoproj/argo-cd/v2/util/settings"
argoio "github.com/argoproj/gitops-engine/pkg/utils/io"
)

type namespacedResource struct {
Expand Down Expand Up @@ -163,6 +164,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
false,
false,
normalizers.IgnoreNormalizerOpts{},
argoio.TempPathUseDevShmIfAvailable(),
)
db := &dbmocks.ArgoDB{}
db.On("GetApplicationControllerReplicas").Return(1)
Expand Down
24 changes: 15 additions & 9 deletions controller/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,21 +170,23 @@ func NewLiveStateCache(
appInformer cache.SharedIndexInformer,
settingsMgr *settings.SettingsManager,
kubectl kube.Kubectl,
manifestGenTmpPath string,
metricsServer *metrics.MetricsServer,
onObjectUpdated ObjectUpdatedHandler,
clusterSharding sharding.ClusterShardingCache,
resourceTracking argo.ResourceTracking,
) LiveStateCache {
return &liveStateCache{
appInformer: appInformer,
db: db,
clusters: make(map[string]clustercache.ClusterCache),
onObjectUpdated: onObjectUpdated,
kubectl: kubectl,
settingsMgr: settingsMgr,
metricsServer: metricsServer,
clusterSharding: clusterSharding,
resourceTracking: resourceTracking,
appInformer: appInformer,
db: db,
clusters: make(map[string]clustercache.ClusterCache),
onObjectUpdated: onObjectUpdated,
kubectl: kubectl,
settingsMgr: settingsMgr,
metricsServer: metricsServer,
clusterSharding: clusterSharding,
resourceTracking: resourceTracking,
manifestGenTmpPath: manifestGenTmpPath,
}
}

Expand Down Expand Up @@ -213,6 +215,9 @@ type liveStateCache struct {
clusters map[string]clustercache.ClusterCache
cacheSettings cacheSettings
lock sync.RWMutex

// manifestGenTmpPath is the temporary path value passed to gitops-engine, which gitops-engine uses when passing kubernetes manifests/cluster credentials to kubectl code. See 'github.com/argoproj/gitops-engine/pkg/utils/io' for details.
manifestGenTmpPath string
}

func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) {
Expand Down Expand Up @@ -524,6 +529,7 @@ func (c *liveStateCache) getCluster(server string) (clustercache.ClusterCache, e
clustercache.SetLogr(logutils.NewLogrusLogger(log.WithField("server", cluster.Server))),
clustercache.SetRetryOptions(clusterCacheAttemptLimit, clusterCacheRetryUseBackoff, isRetryableError),
clustercache.SetRespectRBAC(respectRBAC),
clustercache.SetTmpPath(c.manifestGenTmpPath),
}

clusterCache = clustercache.NewClusterCache(clusterCacheConfig, clusterCacheOpts...)
Expand Down
4 changes: 4 additions & 0 deletions docs/operator-manual/argocd-cmd-params-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ data:
# diff cache is unavailable).
controller.diff.server.side: "false"

# controller.manifest.generation.path can be used to overwrite the default path (within the application-controller container) used to store the temporary files which are passed to kubectl code via gitops-engine. These temporary files are security sensitive resources, such as generated kubernetes manifests, or cluster credentials.
# Due to security sensitivity, by default '/dev/shm' is used. However, 'controller.manifest.generation.path' allows this value to be customized, for example, using '/tmp' to overcome the limited disk space of '/dev/shm'.
controller.manifest.generation.path: "/dev/shm"

## Server properties
# Listen on given address for incoming connections (default "0.0.0.0")
server.listen.address: "0.0.0.0"
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ require (
)

replace (
github.com/argoproj/gitops-engine => github.com/jgwest/gitops-engine v0.1.3-0.20240726204147-73c025b8bd5a
// https://github.com/golang/go/issues/33546#issuecomment-519656923
github.com/go-check/check => github.com/go-check/check v0.0.0-20180628173108-788fd7840127

Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,6 @@ github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb
github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
github.com/apache/thrift v0.16.0/go.mod h1:PHK3hniurgQaNMZYaCLEqXKsYK8upmhPbmdP2FXSqgU=
github.com/appscode/go v0.0.0-20191119085241-0887d8ec2ecc/go.mod h1:OawnOmAL4ZX3YaPdN+8HTNwBveT1jMsqP74moa9XUbE=
github.com/argoproj/gitops-engine v0.7.1-0.20240718175351-6b2984ebc470 h1:RUo6je4n+FgNEkGsONhwxUtT67YqyEtrvMNd+t8pKSo=
github.com/argoproj/gitops-engine v0.7.1-0.20240718175351-6b2984ebc470/go.mod h1:xMIbuLg9Qj2e0egTy+8NcukbhRaVmWwK9vm3aAQZoi4=
github.com/argoproj/notifications-engine v0.4.1-0.20240606074338-0802cd427621 h1:Yg1nt+D2uDK1SL2jSlfukA4yc7db184TTN7iWy3voRE=
github.com/argoproj/notifications-engine v0.4.1-0.20240606074338-0802cd427621/go.mod h1:N0A4sEws2soZjEpY4hgZpQS8mRIEw6otzwfkgc3g9uQ=
github.com/argoproj/pkg v0.13.7-0.20230626144333-d56162821bd1 h1:qsHwwOJ21K2Ao0xPju1sNuqphyMnMYkyB3ZLoLtxWpo=
Expand Down Expand Up @@ -1294,6 +1292,8 @@ github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOl
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo=
github.com/jeremywohl/flatten v1.0.1 h1:LrsxmB3hfwJuE+ptGOijix1PIfOoKLJ3Uee/mzbgtrs=
github.com/jeremywohl/flatten v1.0.1/go.mod h1:4AmD/VxjWcI5SRB0n6szE2A6s2fsNHDLO0nAlMHgfLQ=
github.com/jgwest/gitops-engine v0.1.3-0.20240726204147-73c025b8bd5a h1:By0LRk/OotWPmZ6CtXgo6tAXudWQlJeVmakT3vQTMGI=
github.com/jgwest/gitops-engine v0.1.3-0.20240726204147-73c025b8bd5a/go.mod h1:xMIbuLg9Qj2e0egTy+8NcukbhRaVmWwK9vm3aAQZoi4=
github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,12 @@ spec:
name: argocd-cmd-params-cm
key: controller.diff.server.side
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_MANIFEST_GENERATION_PATH
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.manifest.generation.path
optional: true
image: quay.io/argoproj/argocd:latest
imagePullPolicy: Always
name: argocd-application-controller
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,12 @@ spec:
name: argocd-cmd-params-cm
key: controller.ignore.normalizer.jq.timeout
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_MANIFEST_GENERATION_PATH
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.manifest.generation.path
optional: true
image: quay.io/argoproj/argocd:latest
imagePullPolicy: Always
name: argocd-application-controller
Expand Down
10 changes: 5 additions & 5 deletions util/git/creds.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,10 @@ func (c HTTPSCreds) Environ() (io.Closer, []string, error) {
// We need to actually create two temp files, one for storing cert data and
// another for storing the key. If we fail to create second fail, the first
// must be removed.
certFile, err := os.CreateTemp(argoio.TempDir, "")
certFile, err := os.CreateTemp(argoio.TempPathUseDevShmIfAvailable(), "")
if err == nil {
defer certFile.Close()
keyFile, err = os.CreateTemp(argoio.TempDir, "")
keyFile, err = os.CreateTemp(argoio.TempPathUseDevShmIfAvailable(), "")
if err != nil {
removeErr := os.Remove(certFile.Name())
if removeErr != nil {
Expand Down Expand Up @@ -270,7 +270,7 @@ func (f authFilePaths) Close() error {

func (c SSHCreds) Environ() (io.Closer, []string, error) {
// use the SHM temp dir from util, more secure
file, err := os.CreateTemp(argoio.TempDir, "")
file, err := os.CreateTemp(argoio.TempPathUseDevShmIfAvailable(), "")
if err != nil {
return nil, nil, err
}
Expand Down Expand Up @@ -371,10 +371,10 @@ func (g GitHubAppCreds) Environ() (io.Closer, []string, error) {
// We need to actually create two temp files, one for storing cert data and
// another for storing the key. If we fail to create second fail, the first
// must be removed.
certFile, err := os.CreateTemp(argoio.TempDir, "")
certFile, err := os.CreateTemp(argoio.TempPathUseDevShmIfAvailable(), "")
if err == nil {
defer certFile.Close()
keyFile, err = os.CreateTemp(argoio.TempDir, "")
keyFile, err = os.CreateTemp(argoio.TempPathUseDevShmIfAvailable(), "")
if err != nil {
removeErr := os.Remove(certFile.Name())
if removeErr != nil {
Expand Down
4 changes: 2 additions & 2 deletions util/git/creds_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,9 @@ func Test_SSHCreds_Environ_WithProxyUserNamePassword(t *testing.T) {
func Test_SSHCreds_Environ_TempFileCleanupOnInvalidProxyURL(t *testing.T) {
// Previously, if the proxy URL was invalid, a temporary file would be left in /dev/shm. This ensures the file is cleaned up in this case.

// countDev returns the number of files in /dev/shm (argoio.TempDir)
// countDev returns the number of files in /dev/shm (argoio.TempPathUseDevShmIfAvailable())
countFilesInDevShm := func() int {
entries, err := os.ReadDir(argoio.TempDir)
entries, err := os.ReadDir(argoio.TempPathUseDevShmIfAvailable())
require.NoError(t, err)

return len(entries)
Expand Down

0 comments on commit bcc5c2d

Please sign in to comment.