OCPBUGS-32467: nodepool_controller: add a reconciler for cleanup #3969

Merged
17 changes: 10 additions & 7 deletions hypershift-operator/controllers/nodepool/manifests.go
@@ -42,20 +42,23 @@ func machineHealthCheck(nodePool *hyperv1.NodePool, controlPlaneNamespace string
}
}

const ignitionUserDataPrefix = "user-data"

func IgnitionUserDataSecret(namespace, name, payloadInputHash string) *corev1.Secret {
-return &corev1.Secret{
-ObjectMeta: metav1.ObjectMeta{
-Namespace: namespace,
-Name: fmt.Sprintf("user-data-%s-%s", name, payloadInputHash),
-},
-}
+return namedSecret(namespace, fmt.Sprintf("%s-%s-%s", ignitionUserDataPrefix, name, payloadInputHash))
}

const tokenSecretPrefix = "token"

func TokenSecret(namespace, name, payloadInputHash string) *corev1.Secret {
return namedSecret(namespace, fmt.Sprintf("%s-%s-%s", tokenSecretPrefix, name, payloadInputHash))
}

func namedSecret(namespace, name string) *corev1.Secret {
return &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Namespace: namespace,
Name: fmt.Sprintf("token-%s-%s", name, payloadInputHash),
Name: name,
},
}
}
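
For illustration, this refactor does not change the generated names; a hypothetical NodePool "workers" with payload hash "abc123" in namespace "clusters-foo" yields the same Secret names as before:

	tokenName := TokenSecret("clusters-foo", "workers", "abc123").Name               // "token-workers-abc123"
	userDataName := IgnitionUserDataSecret("clusters-foo", "workers", "abc123").Name // "user-data-workers-abc123"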
230 changes: 195 additions & 35 deletions hypershift-operator/controllers/nodepool/nodepool_controller.go
@@ -117,7 +117,6 @@ type NodePoolReconciler struct {
client.Client
recorder record.EventRecorder
ReleaseProvider releaseinfo.Provider
-controller controller.Controller
upsert.CreateOrUpdateProvider
HypershiftOperatorImage string
ImageMetadataProvider supportutil.ImageMetadataProvider
@@ -141,7 +140,7 @@ var (
)

func (r *NodePoolReconciler) SetupWithManager(mgr ctrl.Manager) error {
-controller, err := ctrl.NewControllerManagedBy(mgr).
+if err := ctrl.NewControllerManagedBy(mgr).
For(&hyperv1.NodePool{}, builder.WithPredicates(supportutil.PredicatesForHostedClusterAnnotationScoping(mgr.GetClient()))).
// We want to reconcile when the HostedCluster IgnitionEndpoint is available.
Watches(&hyperv1.HostedCluster{}, handler.EnqueueRequestsFromMapFunc(r.enqueueNodePoolsForHostedCluster), builder.WithPredicates(supportutil.PredicatesForHostedClusterAnnotationScoping(mgr.GetClient()))).
@@ -158,12 +157,22 @@ func (r *NodePoolReconciler) SetupWithManager(mgr ctrl.Manager) error {
RateLimiter: workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, 10*time.Second),
MaxConcurrentReconciles: 10,
}).
-Build(r)
-if err != nil {
+Complete(r); err != nil {
return errors.Wrap(err, "failed setting up with a controller manager")
}

if err := ctrl.NewControllerManagedBy(mgr).
For(&corev1.Secret{}).
WithOptions(controller.Options{
RateLimiter: workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, 10*time.Second),
MaxConcurrentReconciles: 10,
}).
Complete(&secretJanitor{
NodePoolReconciler: r,
Comment on lines +165 to +171
Contributor:
If only Secret metadata is needed, I'm wondering if we can set up the controller-runtime client to cache only the Secret metadata. https://github.com/kubernetes-sigs/controller-runtime/blob/479b723944e34ae42c9911fe01228ff34eb5ca81/pkg/builder/example_test.go#L38
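
For reference, a minimal sketch of the metadata-only watch being suggested, assuming controller-runtime's builder.OnlyMetadata option together with metav1.PartialObjectMetadata and reconcile.Func; the reconciler body and error message are illustrative, not part of this PR:

	// Sketch: with builder.OnlyMetadata the cache stores only Secret metadata
	// (names, labels, annotations), so the reconciler cannot read .Data from it.
	if err := ctrl.NewControllerManagedBy(mgr).
		For(&corev1.Secret{}, builder.OnlyMetadata).
		Complete(reconcile.Func(func(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
			secretMeta := &metav1.PartialObjectMetadata{}
			secretMeta.SetGroupVersionKind(corev1.SchemeGroupVersion.WithKind("Secret"))
			if err := mgr.GetClient().Get(ctx, req.NamespacedName, secretMeta); err != nil {
				return reconcile.Result{}, client.IgnoreNotFound(err)
			}
			_ = secretMeta.Annotations[nodePoolAnnotation] // annotations are cached; Data is not
			return reconcile.Result{}, nil
		})); err != nil {
		return errors.Wrap(err, "failed setting up metadata-only secret watch")
	}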

}); err != nil {
return errors.Wrap(err, "failed setting up with a controller manager")
}

-r.controller = controller
r.recorder = mgr.GetEventRecorderFor("nodepool-controller")

return nil
@@ -267,26 +276,10 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
return ctrl.Result{}, nil
}

-// 1. - Reconcile conditions according to current state of the world.
-proxy := globalconfig.ProxyConfig()
-globalconfig.ReconcileProxyConfigWithStatusFromHostedCluster(proxy, hcluster)
-
-// NOTE: The image global config is not injected via userdata or NodePool ignition config.
-// It is included directly by the ignition server. However, we need to detect the change
-// here to trigger a nodepool update.
-image := globalconfig.ImageConfig()
-globalconfig.ReconcileImageConfigFromHostedCluster(image, hcluster)
-
-// Serialize proxy and image into a single string to use in the token secret hash.
-globalConfigBytes := bytes.NewBuffer(nil)
-enc := json.NewEncoder(globalConfigBytes)
-if err := enc.Encode(proxy); err != nil {
-return ctrl.Result{}, fmt.Errorf("failed to encode proxy global config: %w", err)
-}
-if err := enc.Encode(image); err != nil {
-return ctrl.Result{}, fmt.Errorf("failed to encode image global config: %w", err)
+globalConfig, err := globalConfigString(hcluster)
+if err != nil {
+return ctrl.Result{}, err
}
-globalConfig := globalConfigBytes.String()

if isAutoscalingEnabled(nodePool) {
SetStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolCondition{
@@ -562,11 +555,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
// 3 generic core config resources: fips, ssh and haproxy.
// TODO (alberto): consider moving the expectedCoreConfigResources check
// into the token Secret controller so we don't block Machine infra creation on this.
-expectedCoreConfigResources := 3
-if len(hcluster.Spec.ImageContentSources) > 0 {
-// additional core config resource created when image content source specified.
-expectedCoreConfigResources += 1
-}
+expectedCoreConfigResources := expectedCoreConfigResourcesForHostedCluster(hcluster)
config, missingConfigs, err := r.getConfig(ctx, nodePool, expectedCoreConfigResources, controlPlaneNamespace, releaseImage, hcluster)
if err != nil {
SetStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolCondition{
@@ -643,7 +632,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
}

// Signal ignition payload generation
-targetPayloadConfigHash := supportutil.HashSimple(config + targetVersion + pullSecretName + globalConfig)
+targetPayloadConfigHash := payloadConfigHash(config, targetVersion, pullSecretName, globalConfig)
tokenSecret := TokenSecret(controlPlaneNamespace, nodePool.Name, targetPayloadConfigHash)
condition, err := r.createValidGeneratedPayloadCondition(ctx, tokenSecret, nodePool.Generation)
if err != nil {
@@ -771,7 +760,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho
return ctrl.Result{}, fmt.Errorf("failed to get token Secret: %w", err)
}
if err == nil {
-if err := setExpirationTimestampOnToken(ctx, r.Client, tokenSecret); err != nil && !apierrors.IsNotFound(err) {
+if err := setExpirationTimestampOnToken(ctx, r.Client, tokenSecret, nil); err != nil && !apierrors.IsNotFound(err) {
return ctrl.Result{}, fmt.Errorf("failed to set expiration on token Secret: %w", err)
}
}
@@ -818,7 +807,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho

userDataSecret := IgnitionUserDataSecret(controlPlaneNamespace, nodePool.GetName(), targetPayloadConfigHash)
if result, err := r.CreateOrUpdate(ctx, r.Client, userDataSecret, func() error {
-return reconcileUserDataSecret(userDataSecret, nodePool, caCertBytes, tokenBytes, ignEndpoint, targetPayloadConfigHash, proxy)
+return reconcileUserDataSecret(userDataSecret, nodePool, caCertBytes, tokenBytes, ignEndpoint, targetPayloadConfigHash, globalconfig.ProxyConfig())
}); err != nil {
return ctrl.Result{}, err
} else {
@@ -996,6 +985,15 @@ func isArchAndPlatformSupported(nodePool *hyperv1.NodePool) bool {
return supported
}

func expectedCoreConfigResourcesForHostedCluster(hcluster *hyperv1.HostedCluster) int {
expectedCoreConfigResources := 3
if len(hcluster.Spec.ImageContentSources) > 0 {
// additional core config resource created when image content source specified.
expectedCoreConfigResources += 1
}
return expectedCoreConfigResources
}

// setMachineAndNodeConditions sets the nodePool's AllMachinesReady and AllNodesHealthy conditions.
func (r *NodePoolReconciler) setMachineAndNodeConditions(ctx context.Context, nodePool *hyperv1.NodePool, hc *hyperv1.HostedCluster) error {
// Get all Machines for NodePool.
@@ -2827,13 +2825,17 @@ func validateInfraID(infraID string) error {
return nil
}

-func setExpirationTimestampOnToken(ctx context.Context, c client.Client, tokenSecret *corev1.Secret) error {
+func setExpirationTimestampOnToken(ctx context.Context, c client.Client, tokenSecret *corev1.Secret, now func() time.Time) error {
+if now == nil {
+now = time.Now
+}
+
// this should be a reasonable value to allow all in flight provisions to complete.
timeUntilExpiry := 2 * time.Hour
if tokenSecret.Annotations == nil {
tokenSecret.Annotations = map[string]string{}
}
-tokenSecret.Annotations[hyperv1.IgnitionServerTokenExpirationTimestampAnnotation] = time.Now().Add(timeUntilExpiry).Format(time.RFC3339)
+tokenSecret.Annotations[hyperv1.IgnitionServerTokenExpirationTimestampAnnotation] = now().Add(timeUntilExpiry).Format(time.RFC3339)
return c.Update(ctx, tokenSecret)
}
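
A minimal sketch of how the injected clock might be exercised in a test, assuming the controller-runtime fake client (sigs.k8s.io/controller-runtime/pkg/client/fake); the test name, namespace, secret name, and fixed timestamp are illustrative:

	func TestSetExpirationTimestampOnToken(t *testing.T) {
		fixed := time.Date(2024, time.March, 1, 12, 0, 0, 0, time.UTC)
		c := fake.NewClientBuilder().WithObjects(&corev1.Secret{
			ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "token-np-abc"},
		}).Build()

		// read the object back so the update carries the fake client's resourceVersion
		secret := &corev1.Secret{}
		if err := c.Get(context.Background(), client.ObjectKey{Namespace: "ns", Name: "token-np-abc"}, secret); err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if err := setExpirationTimestampOnToken(context.Background(), c, secret, func() time.Time { return fixed }); err != nil {
			t.Fatalf("unexpected error: %v", err)
		}

		// with now() pinned, the annotation must be exactly fixed + 2h
		want := fixed.Add(2 * time.Hour).Format(time.RFC3339)
		if got := secret.Annotations[hyperv1.IgnitionServerTokenExpirationTimestampAnnotation]; got != want {
			t.Errorf("expected expiration %s, got %s", want, got)
		}
	}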

@@ -2913,8 +2915,12 @@ func (r *NodePoolReconciler) getPullSecretBytes(ctx context.Context, hostedClust

// getPullSecretName retrieves the name of the pull secret in the hosted cluster spec
func (r *NodePoolReconciler) getPullSecretName(ctx context.Context, hostedCluster *hyperv1.HostedCluster) (string, error) {
return getPullSecretName(ctx, r.Client, hostedCluster)
}

func getPullSecretName(ctx context.Context, crclient client.Client, hostedCluster *hyperv1.HostedCluster) (string, error) {
pullSecret := &corev1.Secret{}
-if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hostedCluster.Namespace, Name: hostedCluster.Spec.PullSecret.Name}, pullSecret); err != nil {
+if err := crclient.Get(ctx, client.ObjectKey{Namespace: hostedCluster.Namespace, Name: hostedCluster.Spec.PullSecret.Name}, pullSecret); err != nil {
return "", fmt.Errorf("cannot get pull secret %s/%s: %w", hostedCluster.Namespace, hostedCluster.Spec.PullSecret.Name, err)
}
if _, hasKey := pullSecret.Data[corev1.DockerConfigJsonKey]; !hasKey {
@@ -2982,3 +2988,157 @@ func aggregateMachineMessages(msgs []string) string {

return builder.String()
}

// secretJanitor reconciles secrets and determines which secrets should remain in the cluster and which should be cleaned up.
// Any secret annotated with a nodePool name should only be on the cluster if the nodePool continues to exist
// and if our current calculation for the inputs to the name matches what the secret is named.
type secretJanitor struct {
*NodePoolReconciler

now func() time.Time
}

func (r *secretJanitor) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
log := ctrl.LoggerFrom(ctx).WithValues("secret", req.String())

secret := &corev1.Secret{}
Contributor:
Seems like this reconciler doesn't need to know the full data for the Secret. Could we instead retrieve just the Secret's object metadata? https://github.com/kubernetes/apimachinery/blob/53e91cb5d2ee17451e94b11b065795aedf252096/pkg/apis/meta/v1/types.go#L1496

Contributor Author:
The Secrets are already cached by the manager, so adding an object-metadata watch would degrade performance, paradoxically.
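
For reference, a minimal sketch of the metadata-only read being discussed, using metav1.PartialObjectMetadata; note the caveat above that, with full Secrets already cached for this controller, such a read against the cache-backed client would typically start a second, metadata-only informer rather than save memory:

	// Sketch: fetch only the Secret's object metadata instead of the full object.
	secretMeta := &metav1.PartialObjectMetadata{}
	secretMeta.SetGroupVersionKind(corev1.SchemeGroupVersion.WithKind("Secret"))
	if err := r.Client.Get(ctx, req.NamespacedName, secretMeta); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	nodePoolName, annotated := secretMeta.Annotations[nodePoolAnnotation]
	if !annotated {
		return ctrl.Result{}, nil
	}
	_ = nodePoolName // continue with the same janitor logic, using metadata only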

if err := r.Client.Get(ctx, req.NamespacedName, secret); err != nil {
if apierrors.IsNotFound(err) {
log.Info("not found", "request", req.String())
return ctrl.Result{}, nil
}
log.Error(err, "error getting secret")
return ctrl.Result{}, err
}

// only handle secrets that are associated with a NodePool
nodePoolName, annotated := secret.Annotations[nodePoolAnnotation]
if !annotated {
return ctrl.Result{}, nil
}
log = log.WithValues("nodePool", nodePoolName)

// only handle secret types that we know about explicitly
shouldHandle := false
for _, prefix := range []string{tokenSecretPrefix, ignitionUserDataPrefix} {
if strings.HasPrefix(secret.Name, prefix) {
shouldHandle = true
break
}
}
if !shouldHandle {
return ctrl.Result{}, nil
}

nodePool := &hyperv1.NodePool{}
if err := r.Client.Get(ctx, supportutil.ParseNamespacedName(nodePoolName), nodePool); err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "error getting nodepool")
return ctrl.Result{}, err
} else if apierrors.IsNotFound(err) {
log.Info("removing secret as nodePool is missing")
return ctrl.Result{}, r.Client.Delete(ctx, secret)
}

hcluster, err := GetHostedClusterByName(ctx, r.Client, nodePool.GetNamespace(), nodePool.Spec.ClusterName)
if err != nil {
return ctrl.Result{}, err
}

controlPlaneNamespace := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name)
expectedCoreConfigResources := expectedCoreConfigResourcesForHostedCluster(hcluster)
releaseImage, err := r.getReleaseImage(ctx, hcluster, nodePool.Status.Version, nodePool.Spec.Release.Image)
if err != nil {
return ctrl.Result{}, err
}

config, missingConfigs, err := r.getConfig(ctx, nodePool, expectedCoreConfigResources, controlPlaneNamespace, releaseImage, hcluster)
if err != nil {
return ctrl.Result{}, err
}
if missingConfigs {
return ctrl.Result{}, nil
}
targetVersion := releaseImage.Version()

pullSecretName, err := r.getPullSecretName(ctx, hcluster)
if err != nil {
return ctrl.Result{}, err
}

globalConfig, err := globalConfigString(hcluster)
if err != nil {
return ctrl.Result{}, err
}

targetPayloadConfigHash := payloadConfigHash(config, targetVersion, pullSecretName, globalConfig)

// synchronously deleting the ignition token is unsafe; we need to clean up tokens by annotating them to expire
synchronousCleanup := func(ctx context.Context, c client.Client, secret *corev1.Secret) error {
return c.Delete(ctx, secret)
}
type nodePoolSecret struct {
expectedName string
matchingPrefix string
cleanup func(context.Context, client.Client, *corev1.Secret) error
}
valid := false
options := []nodePoolSecret{
{
expectedName: TokenSecret(controlPlaneNamespace, nodePool.Name, targetPayloadConfigHash).Name,
matchingPrefix: tokenSecretPrefix,
cleanup: func(ctx context.Context, c client.Client, secret *corev1.Secret) error {
return setExpirationTimestampOnToken(ctx, c, secret, r.now)
},
},
{
expectedName: IgnitionUserDataSecret(controlPlaneNamespace, nodePool.GetName(), targetPayloadConfigHash).Name,
matchingPrefix: ignitionUserDataPrefix,
cleanup: synchronousCleanup,
},
}
cleanup := synchronousCleanup
var names []string
for _, option := range options {
names = append(names, option.expectedName)
if secret.Name == option.expectedName {
valid = true
}
if strings.HasPrefix(secret.Name, option.matchingPrefix) {
cleanup = option.cleanup
}
}

if valid {
return ctrl.Result{}, nil
}

log.WithValues("options", names, "valid", valid).Info("removing secret as it does not match the expected set of names")
Member:
By deleting the token secret synchronously here there's a risk we break in-flight booting instances that are about to run ignition. That's in part why we set it for expiration here:

if err == nil {
if err := setExpirationTimestampOnToken(ctx, r.Client, tokenSecret); err != nil && !apierrors.IsNotFound(err) {
return ctrl.Result{}, fmt.Errorf("failed to set expiration on token Secret: %w", err)
}
}

cc @relyt0925

Contributor Author:
@enxebre in order to annotate it correctly, we need to know that the secret we're reconciling is a token secret. Do you want us to add something machine-readable to identify them, or is some string prefix check on the name sufficiently safe?
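
For illustration only, one shape a machine-readable marker could take; the annotation key and values below are invented for this sketch and are not part of this PR, which currently relies on the name-prefix check:

	// Hypothetical: stamp secrets with an explicit type when they are created, then
	// key the janitor's cleanup behavior off that instead of name prefixes.
	const secretTypeAnnotation = "hypershift.openshift.io/secret-type" // invented for this sketch

	switch secret.Annotations[secretTypeAnnotation] {
	case "token":
		cleanup = func(ctx context.Context, c client.Client, s *corev1.Secret) error {
			return setExpirationTimestampOnToken(ctx, c, s, r.now)
		}
	case "user-data":
		cleanup = synchronousCleanup
	default:
		// fall back to the prefix check used elsewhere in this reconciler
	}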

return ctrl.Result{}, cleanup(ctx, r.Client, secret)
}

func globalConfigString(hcluster *hyperv1.HostedCluster) (string, error) {
// 1. - Reconcile conditions according to current state of the world.
proxy := globalconfig.ProxyConfig()
globalconfig.ReconcileProxyConfigWithStatusFromHostedCluster(proxy, hcluster)

// NOTE: The image global config is not injected via userdata or NodePool ignition config.
// It is included directly by the ignition server. However, we need to detect the change
// here to trigger a nodepool update.
image := globalconfig.ImageConfig()
globalconfig.ReconcileImageConfigFromHostedCluster(image, hcluster)

// Serialize proxy and image into a single string to use in the token secret hash.
globalConfigBytes := bytes.NewBuffer(nil)
enc := json.NewEncoder(globalConfigBytes)
if err := enc.Encode(proxy); err != nil {
return "", fmt.Errorf("failed to encode proxy global config: %w", err)
}
if err := enc.Encode(image); err != nil {
return "", fmt.Errorf("failed to encode image global config: %w", err)
}
return globalConfigBytes.String(), nil
}

func payloadConfigHash(config, targetVersion, pullSecretName, globalConfig string) string {
return supportutil.HashSimple(config + targetVersion + pullSecretName + globalConfig)
}