From 986495fbe2cb0dc25f85f7d55eb309b9d5d0a7c1 Mon Sep 17 00:00:00 2001
From: Steve Kuznetsov
Date: Tue, 30 Apr 2024 18:55:31 -0600
Subject: [PATCH] nodepool_controller: add a reconciler for cleanup

When we change the config for a NodePool or the way in which we hash
the values, we leak token and user data secrets. However, since these
secrets are annotated with the NodePool they were created for, it's
simple to check that the Secret still matches what we expect and clean
it up otherwise.

Signed-off-by: Steve Kuznetsov
---
 .../controllers/nodepool/manifests.go         |  17 +-
 .../nodepool/nodepool_controller.go           | 217 +++++++++++++++---
 2 files changed, 194 insertions(+), 40 deletions(-)

diff --git a/hypershift-operator/controllers/nodepool/manifests.go b/hypershift-operator/controllers/nodepool/manifests.go
index 2842290835a..994b099171a 100644
--- a/hypershift-operator/controllers/nodepool/manifests.go
+++ b/hypershift-operator/controllers/nodepool/manifests.go
@@ -42,20 +42,23 @@ func machineHealthCheck(nodePool *hyperv1.NodePool, controlPlaneNamespace string
 	}
 }
 
+const ignitionUserDataPrefix = "user-data"
+
 func IgnitionUserDataSecret(namespace, name, payloadInputHash string) *corev1.Secret {
-	return &corev1.Secret{
-		ObjectMeta: metav1.ObjectMeta{
-			Namespace: namespace,
-			Name:      fmt.Sprintf("user-data-%s-%s", name, payloadInputHash),
-		},
-	}
+	return namedSecret(namespace, fmt.Sprintf("%s-%s-%s", ignitionUserDataPrefix, name, payloadInputHash))
 }
 
+const tokenSecretPrefix = "token"
+
 func TokenSecret(namespace, name, payloadInputHash string) *corev1.Secret {
+	return namedSecret(namespace, fmt.Sprintf("%s-%s-%s", tokenSecretPrefix, name, payloadInputHash))
+}
+
+func namedSecret(namespace, name string) *corev1.Secret {
 	return &corev1.Secret{
 		ObjectMeta: metav1.ObjectMeta{
 			Namespace: namespace,
-			Name:      fmt.Sprintf("token-%s-%s", name, payloadInputHash),
+			Name:      name,
 		},
 	}
 }
diff --git a/hypershift-operator/controllers/nodepool/nodepool_controller.go b/hypershift-operator/controllers/nodepool/nodepool_controller.go
index 17ecefcb7c1..b6f15a01011 100644
--- a/hypershift-operator/controllers/nodepool/nodepool_controller.go
+++ b/hypershift-operator/controllers/nodepool/nodepool_controller.go
@@ -24,7 +24,6 @@ import (
 	agentv1 "github.com/openshift/cluster-api-provider-agent/api/v1beta1"
 	performanceprofilev2 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/performanceprofile/v2"
 	tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
-
 	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
@@ -116,7 +115,6 @@ type NodePoolReconciler struct {
 	client.Client
 	recorder        record.EventRecorder
 	ReleaseProvider releaseinfo.Provider
-	controller      controller.Controller
 	upsert.CreateOrUpdateProvider
 	HypershiftOperatorImage string
 	ImageMetadataProvider   supportutil.ImageMetadataProvider
@@ -140,7 +138,7 @@ var (
 )
 
 func (r *NodePoolReconciler) SetupWithManager(mgr ctrl.Manager) error {
-	controller, err := ctrl.NewControllerManagedBy(mgr).
+	if err := ctrl.NewControllerManagedBy(mgr).
 		For(&hyperv1.NodePool{}).
 		// We want to reconcile when the HostedCluster IgnitionEndpoint is available.
 		Watches(&hyperv1.HostedCluster{}, handler.EnqueueRequestsFromMapFunc(r.enqueueNodePoolsForHostedCluster)).
@@ -157,12 +155,22 @@ func (r *NodePoolReconciler) SetupWithManager(mgr ctrl.Manager) error {
 			RateLimiter:             workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, 10*time.Second),
 			MaxConcurrentReconciles: 10,
 		}).
-		Build(r)
-	if err != nil {
+		Complete(r); err != nil {
+		return errors.Wrap(err, "failed setting up with a controller manager")
+	}
+
+	if err := ctrl.NewControllerManagedBy(mgr).
+		For(&corev1.Secret{}).
+		WithOptions(controller.Options{
+			RateLimiter:             workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, 10*time.Second),
+			MaxConcurrentReconciles: 10,
+		}).
+		Complete(&secretJanitor{
+			NodePoolReconciler: r,
+		}); err != nil {
 		return errors.Wrap(err, "failed setting up with a controller manager")
 	}
-	r.controller = controller
 
 	r.recorder = mgr.GetEventRecorderFor("nodepool-controller")
 
 	return nil
@@ -266,26 +274,10 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
 		return ctrl.Result{}, nil
 	}
 
-	// 1. - Reconcile conditions according to current state of the world.
-	proxy := globalconfig.ProxyConfig()
-	globalconfig.ReconcileProxyConfigWithStatusFromHostedCluster(proxy, hcluster)
-
-	// NOTE: The image global config is not injected via userdata or NodePool ignition config.
-	// It is included directly by the ignition server. However, we need to detect the change
-	// here to trigger a nodepool update.
-	image := globalconfig.ImageConfig()
-	globalconfig.ReconcileImageConfigFromHostedCluster(image, hcluster)
-
-	// Serialize proxy and image into a single string to use in the token secret hash.
-	globalConfigBytes := bytes.NewBuffer(nil)
-	enc := json.NewEncoder(globalConfigBytes)
-	if err := enc.Encode(proxy); err != nil {
-		return ctrl.Result{}, fmt.Errorf("failed to encode proxy global config: %w", err)
-	}
-	if err := enc.Encode(image); err != nil {
-		return ctrl.Result{}, fmt.Errorf("failed to encode image global config: %w", err)
+	globalConfig, err := globalConfigString(hcluster)
+	if err != nil {
+		return ctrl.Result{}, err
 	}
-	globalConfig := globalConfigBytes.String()
 
 	if isAutoscalingEnabled(nodePool) {
 		SetStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolCondition{
@@ -561,11 +553,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
 	// 3 generic core config resources: fips, ssh and haproxy.
 	// TODO (alberto): consider moving the expectedCoreConfigResources check
 	// into the token Secret controller so we don't block Machine infra creation on this.
-	expectedCoreConfigResources := 3
-	if len(hcluster.Spec.ImageContentSources) > 0 {
-		// additional core config resource created when image content source specified.
-		expectedCoreConfigResources += 1
-	}
+	expectedCoreConfigResources := expectedCoreConfigResourcesForHostedCluster(hcluster)
 	config, missingConfigs, err := r.getConfig(ctx, nodePool, expectedCoreConfigResources, controlPlaneNamespace, releaseImage, hcluster)
 	if err != nil {
 		SetStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolCondition{
@@ -642,7 +630,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
 	}
 
 	// Signal ignition payload generation
-	targetPayloadConfigHash := supportutil.HashSimple(config + targetVersion + pullSecretName + globalConfig)
+	targetPayloadConfigHash := payloadConfigHash(config, targetVersion, pullSecretName, globalConfig)
 	tokenSecret := TokenSecret(controlPlaneNamespace, nodePool.Name, targetPayloadConfigHash)
 	condition, err := r.createValidGeneratedPayloadCondition(ctx, tokenSecret, nodePool.Generation)
 	if err != nil {
@@ -817,7 +805,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
 
 	userDataSecret := IgnitionUserDataSecret(controlPlaneNamespace, nodePool.GetName(), targetPayloadConfigHash)
 	if result, err := r.CreateOrUpdate(ctx, r.Client, userDataSecret, func() error {
-		return reconcileUserDataSecret(userDataSecret, nodePool, caCertBytes, tokenBytes, ignEndpoint, targetPayloadConfigHash, proxy)
+		return reconcileUserDataSecret(userDataSecret, nodePool, caCertBytes, tokenBytes, ignEndpoint, targetPayloadConfigHash, globalconfig.ProxyConfig())
 	}); err != nil {
 		return ctrl.Result{}, err
 	} else {
@@ -970,6 +958,15 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
 	return ctrl.Result{}, nil
 }
 
+func expectedCoreConfigResourcesForHostedCluster(hcluster *hyperv1.HostedCluster) int {
+	expectedCoreConfigResources := 3
+	if len(hcluster.Spec.ImageContentSources) > 0 {
+		// additional core config resource created when image content source specified.
+		expectedCoreConfigResources += 1
+	}
+	return expectedCoreConfigResources
+}
+
 // setMachineAndNodeConditions sets the nodePool's AllMachinesReady and AllNodesHealthy conditions.
 func (r *NodePoolReconciler) setMachineAndNodeConditions(ctx context.Context, nodePool *hyperv1.NodePool, hc *hyperv1.HostedCluster) error {
 	// Get all Machines for NodePool.
@@ -2848,8 +2845,12 @@ func (r *NodePoolReconciler) getPullSecretBytes(ctx context.Context, hostedClust
 
 // getPullSecretName retrieves the name of the pull secret in the hosted cluster spec
 func (r *NodePoolReconciler) getPullSecretName(ctx context.Context, hostedCluster *hyperv1.HostedCluster) (string, error) {
+	return getPullSecretName(ctx, r.Client, hostedCluster)
+}
+
+func getPullSecretName(ctx context.Context, crclient client.Client, hostedCluster *hyperv1.HostedCluster) (string, error) {
 	pullSecret := &corev1.Secret{}
-	if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hostedCluster.Namespace, Name: hostedCluster.Spec.PullSecret.Name}, pullSecret); err != nil {
+	if err := crclient.Get(ctx, client.ObjectKey{Namespace: hostedCluster.Namespace, Name: hostedCluster.Spec.PullSecret.Name}, pullSecret); err != nil {
 		return "", fmt.Errorf("cannot get pull secret %s/%s: %w", hostedCluster.Namespace, hostedCluster.Spec.PullSecret.Name, err)
 	}
 	if _, hasKey := pullSecret.Data[corev1.DockerConfigJsonKey]; !hasKey {
@@ -2917,3 +2918,153 @@ func aggregateMachineMessages(msgs []string) string {
 
 	return builder.String()
 }
+
+// secretJanitor reconciles secrets and determines which secrets should remain in the cluster and which should be cleaned up.
+// Any secret annotated with a nodePool name should only be on the cluster if the nodePool continues to exist
+// and if our current calculation for the inputs to the name matches what the secret is named.
+type secretJanitor struct {
+	*NodePoolReconciler
+}
+
+func (r *secretJanitor) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
+	log := ctrl.LoggerFrom(ctx)
+
+	secret := &corev1.Secret{}
+	if err := r.Client.Get(ctx, req.NamespacedName, secret); err != nil {
+		if apierrors.IsNotFound(err) {
+			log.Info("not found", "request", req.String())
+			return ctrl.Result{}, nil
+		}
+		log.Error(err, "error getting secret")
+		return ctrl.Result{}, err
+	}
+
+	// only handle secrets that are associated with a NodePool
+	nodePoolName, annotated := secret.Annotations[nodePoolAnnotation]
+	if !annotated {
+		return ctrl.Result{}, nil
+	}
+	log = log.WithValues("nodePool", nodePoolName)
+
+	// only handle secret types that we know about explicitly
+	shouldHandle := false
+	for _, prefix := range []string{tokenSecretPrefix, ignitionUserDataPrefix} {
+		if strings.HasPrefix(secret.Name, prefix) {
+			shouldHandle = true
+			break
+		}
+	}
+	if !shouldHandle {
+		return ctrl.Result{}, nil
+	}
+
+	nodePool := &hyperv1.NodePool{}
+	if err := r.Client.Get(ctx, supportutil.ParseNamespacedName(nodePoolName), nodePool); err != nil && !apierrors.IsNotFound(err) {
+		log.Error(err, "error getting nodepool")
+		return ctrl.Result{}, err
+	} else if apierrors.IsNotFound(err) {
+		log.Info("removing secret as nodePool is missing")
+		return ctrl.Result{}, r.Client.Delete(ctx, secret)
+	}
+
+	hcluster, err := GetHostedClusterByName(ctx, r.Client, nodePool.GetNamespace(), nodePool.Spec.ClusterName)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	controlPlaneNamespace := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name)
+	expectedCoreConfigResources := expectedCoreConfigResourcesForHostedCluster(hcluster)
+	releaseImage, err := r.getReleaseImage(ctx, hcluster, nodePool.Status.Version, nodePool.Spec.Release.Image)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	config, missingConfigs, err := r.getConfig(ctx, nodePool, expectedCoreConfigResources, controlPlaneNamespace, releaseImage, hcluster)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if missingConfigs {
+		return ctrl.Result{}, nil
+	}
+	targetVersion := releaseImage.Version()
+
+	pullSecretName, err := r.getPullSecretName(ctx, hcluster)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	globalConfig, err := globalConfigString(hcluster)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	targetPayloadConfigHash := payloadConfigHash(config, targetVersion, pullSecretName, globalConfig)
+
+	// synchronously deleting the ignition token is unsafe; we need to clean up tokens by annotating them to expire
+	synchronousCleanup := func(ctx context.Context, c client.Client, secret *corev1.Secret) error {
+		return c.Delete(ctx, secret)
+	}
+	type nodePoolSecret struct {
+		expectedName   string
+		matchingPrefix string
+		cleanup        func(context.Context, client.Client, *corev1.Secret) error
+	}
+	valid := false
+	options := []nodePoolSecret{
+		{
+			expectedName:   TokenSecret(controlPlaneNamespace, nodePool.Name, targetPayloadConfigHash).Name,
+			matchingPrefix: tokenSecretPrefix,
+			cleanup:        setExpirationTimestampOnToken,
+		},
+		{
+			expectedName:   IgnitionUserDataSecret(controlPlaneNamespace, nodePool.GetName(), targetPayloadConfigHash).Name,
+			matchingPrefix: ignitionUserDataPrefix,
+			cleanup:        synchronousCleanup,
+		},
+	}
+	cleanup := synchronousCleanup
+	var names []string
+	for _, option := range options {
+		names = append(names, option.expectedName)
+		if secret.Name == option.expectedName {
+			valid = true
+		}
+		if strings.HasPrefix(secret.Name, option.matchingPrefix) {
+			cleanup = option.cleanup
+		}
+	}
+
+	if valid {
+		return ctrl.Result{}, nil
+	}
+
+	log.WithValues("options", names, "valid", valid).Info("removing secret as it does not match the expected set of names")
+	return ctrl.Result{}, cleanup(ctx, r.Client, secret)
+}
+
+func globalConfigString(hcluster *hyperv1.HostedCluster) (string, error) {
+	// 1. - Reconcile conditions according to current state of the world.
+	proxy := globalconfig.ProxyConfig()
+	globalconfig.ReconcileProxyConfigWithStatusFromHostedCluster(proxy, hcluster)
+
+	// NOTE: The image global config is not injected via userdata or NodePool ignition config.
+	// It is included directly by the ignition server. However, we need to detect the change
+	// here to trigger a nodepool update.
+	image := globalconfig.ImageConfig()
+	globalconfig.ReconcileImageConfigFromHostedCluster(image, hcluster)
+
+	// Serialize proxy and image into a single string to use in the token secret hash.
+	globalConfigBytes := bytes.NewBuffer(nil)
+	enc := json.NewEncoder(globalConfigBytes)
+	if err := enc.Encode(proxy); err != nil {
+		return "", fmt.Errorf("failed to encode proxy global config: %w", err)
+	}
+	if err := enc.Encode(image); err != nil {
+		return "", fmt.Errorf("failed to encode image global config: %w", err)
+	}
+	return globalConfigBytes.String(), nil
+}
+
+func payloadConfigHash(config, targetVersion, pullSecretName, globalConfig string) string {
+	return supportutil.HashSimple(config + targetVersion + pullSecretName + globalConfig)
+}
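
Editor's note, not part of the patch: the stand-alone Go sketch below shows the name-matching rule the secretJanitor applies, in isolation, for readers skimming the diff. The token/user-data prefixes and the hash-input concatenation mirror the code above, but hashPayload, the NodePool name "example", and the sample inputs are illustrative stand-ins; the real controller uses supportutil.HashSimple and live cluster state.

package main

import (
	"crypto/sha256"
	"fmt"
	"strings"
)

// hashPayload stands in for supportutil.HashSimple; the real helper may hash
// differently, so treat this as a placeholder, not the production function.
func hashPayload(config, targetVersion, pullSecretName, globalConfig string) string {
	sum := sha256.Sum256([]byte(config + targetVersion + pullSecretName + globalConfig))
	return fmt.Sprintf("%x", sum[:4])
}

// expectedSecretNames mirrors the TokenSecret and IgnitionUserDataSecret naming
// scheme from the patch: "<prefix>-<nodePoolName>-<payloadHash>".
func expectedSecretNames(nodePoolName, payloadHash string) []string {
	return []string{
		fmt.Sprintf("token-%s-%s", nodePoolName, payloadHash),
		fmt.Sprintf("user-data-%s-%s", nodePoolName, payloadHash),
	}
}

// shouldCleanUp reports whether a secret name carries one of the prefixes the
// janitor owns but no longer matches any name derived from the current inputs.
func shouldCleanUp(secretName string, expected []string) bool {
	owned := false
	for _, prefix := range []string{"token", "user-data"} {
		if strings.HasPrefix(secretName, prefix) {
			owned = true
			break
		}
	}
	if !owned {
		return false // not a secret type the janitor handles
	}
	for _, name := range expected {
		if secretName == name {
			return false // still matches the current payload hash
		}
	}
	return true // stale: prefix matches, name does not
}

func main() {
	// "example" and the inputs below are hypothetical values for illustration.
	current := hashPayload("rendered-config", "4.16.0", "pull-secret", "{}")
	expected := expectedSecretNames("example", current)

	stale := fmt.Sprintf("token-example-%s", hashPayload("old-config", "4.15.0", "pull-secret", "{}"))
	fmt.Println(shouldCleanUp(stale, expected))               // true: old hash, would be cleaned up
	fmt.Println(shouldCleanUp(expected[0], expected))         // false: matches current hash
	fmt.Println(shouldCleanUp("some-other-secret", expected)) // false: not a janitor-owned prefix
}

The idea matches the commit message: a secret whose name carries a controller-owned prefix but no longer encodes the current payload hash is an orphan from an earlier configuration, so the janitor expires it (tokens) or deletes it (user data).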