From a73b8febad236a3d82e618cbb414dcce46971b67 Mon Sep 17 00:00:00 2001 From: Nir Magnezi Date: Wed, 28 Apr 2021 15:30:36 +0300 Subject: [PATCH] MGMT-4261 Agent CR cleanup when deleting clusterDeployment clusterdeployments_controller: When a clusterDeployment CR gets deleted, the controller detects the relevant agents and pushes updates to the agent_controller, which in turn will reconcile and delete them. agent_controller: The controller will delete agent CR if one of the following scenarios takes place during reconcile: 1. No clusterDeployment was found for the agent. 2. No cluster (backend) was found for the agent. 3. Agent does not belong to the cluster retrieved from the backend, which may happen with newly created day2 clusters. P.S. Note that this PR does not cover host dergister upon Agent CR deletion, however hosts get cleaned up from the backend for the above-mentioned scenarios. --- internal/cluster/cluster.go | 4 ++ internal/cluster/cluster_test.go | 2 + .../controllers/agent_controller.go | 25 +++++++++++- .../controllers/agent_controller_test.go | 25 +++--------- subsystem/kubeapi_test.go | 40 +++++++++++++++++-- 5 files changed, 72 insertions(+), 24 deletions(-) diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 4d3e679899d..dca09c0df40 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -233,6 +233,10 @@ func (m *Manager) DeregisterCluster(ctx context.Context, c *common.Cluster) erro } else { m.eventsHandler.AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo, fmt.Sprintf("Deregistered cluster: %s", c.ID.String()), time.Now()) + for _, h := range c.Hosts { + m.eventsHandler.AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo, + fmt.Sprintf("Deregistered cluster: %s host %s", c.ID.String(), h.ID.String()), time.Now()) + } } return err } diff --git a/internal/cluster/cluster_test.go b/internal/cluster/cluster_test.go index 0b5f0cb911a..758320aaa88 100644 --- a/internal/cluster/cluster_test.go +++ b/internal/cluster/cluster_test.go @@ -2753,6 +2753,7 @@ var _ = Describe("Validation metrics and events", func() { mockHost.EXPECT().ReportValidationFailedMetrics(ctx, gomock.Any(), openshiftVersion, emailDomain) mockMetric.EXPECT().ClusterValidationFailed(openshiftVersion, emailDomain, models.ClusterValidationIDSufficientMastersCount) mockEvents.EXPECT().AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo, gomock.Any(), gomock.Any()) + mockEvents.EXPECT().AddEvent(ctx, *c.ID, gomock.Any(), models.EventSeverityInfo, gomock.Any(), gomock.Any()) err := m.DeregisterCluster(ctx, c) Expect(err).ShouldNot(HaveOccurred()) @@ -2764,6 +2765,7 @@ var _ = Describe("Validation metrics and events", func() { mockHost.EXPECT().ReportValidationFailedMetrics(ctx, gomock.Any(), openshiftVersion, emailDomain) mockMetric.EXPECT().ClusterValidationFailed(openshiftVersion, emailDomain, models.ClusterValidationIDSufficientMastersCount) mockEvents.EXPECT().AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo, gomock.Any(), gomock.Any()) + mockEvents.EXPECT().AddEvent(ctx, *c.ID, gomock.Any(), models.EventSeverityInfo, gomock.Any(), gomock.Any()) err := m.DeregisterCluster(ctx, c) Expect(err).ShouldNot(HaveOccurred()) diff --git a/internal/controller/controllers/agent_controller.go b/internal/controller/controllers/agent_controller.go index 339e3f2b4b9..49e6924e7db 100644 --- a/internal/controller/controllers/agent_controller.go +++ b/internal/controller/controllers/agent_controller.go @@ -38,6 +38,7 @@ import ( "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" @@ -80,6 +81,9 @@ func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Retrieve clusterDeployment if err = r.Get(ctx, kubeKey, clusterDeployment); err != nil { + if k8serrors.IsNotFound(err) { + return r.deleteAgent(ctx, req.NamespacedName) + } errMsg := fmt.Sprintf("failed to get clusterDeployment with name %s in namespace %s", agent.Spec.ClusterDeploymentName.Name, agent.Spec.ClusterDeploymentName.Namespace) // Update that we failed to retrieve the clusterDeployment @@ -89,6 +93,9 @@ func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Retrieve cluster for ClusterDeploymentName from the database cluster, err := r.Installer.GetClusterByKubeKey(kubeKey) if err != nil { + if errors.Is(err, gorm.ErrRecordNotFound) { + return r.deleteAgent(ctx, req.NamespacedName) + } // Update that we failed to retrieve the cluster from the database return r.updateStatus(ctx, agent, nil, err, !errors.Is(err, gorm.ErrRecordNotFound)) } @@ -96,7 +103,8 @@ func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl //Retrieve host from cluster host := getHostFromCluster(cluster, agent.Name) if host == nil { - return r.updateStatus(ctx, agent, nil, errors.New("host not found in cluster"), false) + // Host is not a part of the cluster, which may happen with newly created day2 clusters. + return r.deleteAgent(ctx, req.NamespacedName) } // check for updates from user, compare spec and update if needed @@ -113,6 +121,21 @@ func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return r.updateStatus(ctx, agent, host, nil, false) } +func (r *AgentReconciler) deleteAgent(ctx context.Context, agent types.NamespacedName) (ctrl.Result, error) { + Requeue := false + agentToDelete := &aiv1beta1.Agent{ + ObjectMeta: metav1.ObjectMeta{ + Name: agent.Name, + Namespace: agent.Namespace, + }, + } + if err := r.Client.Delete(ctx, agentToDelete); err != nil { + r.Log.WithError(err).Errorf("Failed to get delete resource %s %s", agent.Name, agent.Namespace) + Requeue = true + } + return ctrl.Result{Requeue: Requeue}, nil +} + // updateStatus is updating all the Agent Conditions. // In case that an error has occured when trying to sync the Spec, the error (syncErr) is presented in SpecSyncedCondition. // Internal bool differentiate between backend server error (internal HTTP 5XX) and user input error (HTTP 4XXX) diff --git a/internal/controller/controllers/agent_controller_test.go b/internal/controller/controllers/agent_controller_test.go index 4416bc29a70..2e9652f2b0f 100644 --- a/internal/controller/controllers/agent_controller_test.go +++ b/internal/controller/controllers/agent_controller_test.go @@ -102,7 +102,7 @@ var _ = Describe("agent reconcile", func() { Expect(c.Get(ctx, key, agent)).To(BeNil()) }) - It("cluster deployment not found", func() { + It("Agent not created when cluster deployment not found", func() { host := newAgent("host", testNamespace, v1beta1.AgentSpec{ClusterDeploymentName: &v1beta1.ClusterReference{Name: "clusterDeployment", Namespace: testNamespace}}) Expect(c.Create(ctx, host)).To(BeNil()) result, err := hr.Reconcile(ctx, newHostRequest(host)) @@ -114,15 +114,10 @@ var _ = Describe("agent reconcile", func() { Namespace: testNamespace, Name: "host", } - Expect(c.Get(ctx, key, agent)).To(BeNil()) - expectedState := fmt.Sprintf("%s failed to get clusterDeployment with name clusterDeployment in namespace test-namespace: clusterdeployments.hive.openshift.io \"clusterDeployment\" not found", InputErrorMsg) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Message).To(Equal(expectedState)) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Reason).To(Equal(InputErrorReason)) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse)) - + Expect(c.Get(ctx, key, agent).Error()).To(Equal("agents.agent-install.openshift.io \"host\" not found")) }) - It("cluster not found in database", func() { + It("Agent not created when cluster not found in database", func() { host := newAgent("host", testNamespace, v1beta1.AgentSpec{ClusterDeploymentName: &v1beta1.ClusterReference{Name: "clusterDeployment", Namespace: testNamespace}}) Expect(c.Create(ctx, host)).To(BeNil()) clusterDeployment := newClusterDeployment("clusterDeployment", testNamespace, getDefaultClusterDeploymentSpec("clusterDeployment-test", "pull-secret")) @@ -137,11 +132,7 @@ var _ = Describe("agent reconcile", func() { Namespace: testNamespace, Name: "host", } - Expect(c.Get(ctx, key, agent)).To(BeNil()) - expectedState := fmt.Sprintf("%s record not found", InputErrorMsg) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Message).To(Equal(expectedState)) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Reason).To(Equal(InputErrorReason)) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse)) + Expect(c.Get(ctx, key, agent).Error()).To(Equal("agents.agent-install.openshift.io \"host\" not found")) }) It("error getting cluster from database", func() { @@ -168,7 +159,7 @@ var _ = Describe("agent reconcile", func() { Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse)) }) - It("host not found in cluster", func() { + It("Agent not created when host not found in cluster", func() { host := newAgent("host", testNamespace, v1beta1.AgentSpec{ClusterDeploymentName: &v1beta1.ClusterReference{Name: "clusterDeployment", Namespace: testNamespace}}) clusterDeployment := newClusterDeployment("clusterDeployment", testNamespace, getDefaultClusterDeploymentSpec("clusterDeployment-test", "pull-secret")) Expect(c.Create(ctx, clusterDeployment)).To(BeNil()) @@ -183,11 +174,7 @@ var _ = Describe("agent reconcile", func() { Namespace: testNamespace, Name: "host", } - Expect(c.Get(ctx, key, agent)).To(BeNil()) - expectedState := fmt.Sprintf("%s host not found in cluster", InputErrorMsg) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Message).To(Equal(expectedState)) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Reason).To(Equal(InputErrorReason)) - Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse)) + Expect(c.Get(ctx, key, agent).Error()).To(Equal("agents.agent-install.openshift.io \"host\" not found")) }) It("Agent update", func() { diff --git a/subsystem/kubeapi_test.go b/subsystem/kubeapi_test.go index f67baa90ee8..8af8d974070 100644 --- a/subsystem/kubeapi_test.go +++ b/subsystem/kubeapi_test.go @@ -192,6 +192,20 @@ func getClusterFromDB( return cluster } +func getClusterDeploymentAgents(ctx context.Context, client k8sclient.Client, clusterDeployment types.NamespacedName) *v1beta1.AgentList { + agents := &v1beta1.AgentList{} + clusterAgents := &v1beta1.AgentList{} + err := client.List(ctx, agents) + Expect(err).To(BeNil()) + for _, agent := range agents.Items { + if agent.Spec.ClusterDeploymentName.Name == clusterDeployment.Name && + agent.Spec.ClusterDeploymentName.Namespace == clusterDeployment.Namespace { + clusterAgents.Items = append(clusterAgents.Items, agent) + } + } + return clusterAgents +} + func getClusterDeploymentCRD(ctx context.Context, client k8sclient.Client, key types.NamespacedName) *hivev1.ClusterDeployment { cluster := &hivev1.ClusterDeployment{} err := client.Get(ctx, key, cluster) @@ -405,15 +419,16 @@ var _ = Describe("[kube-api]cluster installation", func() { clearDB() }) - It("deploy clusterDeployment with agents and wait for ready", func() { + It("deploy clusterDeployment with agents and wait for ready, delete and verify agents deleted", func() { secretRef := deployLocalObjectSecretIfNeeded(ctx, kubeClient) spec := getDefaultClusterDeploymentSpec(secretRef) + By("Create cluster") deployClusterDeploymentCRD(ctx, kubeClient, spec) - key := types.NamespacedName{ + clusterKey := types.NamespacedName{ Namespace: Options.Namespace, Name: spec.ClusterName, } - cluster := getClusterFromDB(ctx, kubeClient, db, key, waitForReconcileTimeout) + cluster := getClusterFromDB(ctx, kubeClient, db, clusterKey, waitForReconcileTimeout) configureLocalAgentClient(cluster.ID.String()) hosts := make([]*models.Host, 0) for i := 0; i < 3; i++ { @@ -422,6 +437,7 @@ var _ = Describe("[kube-api]cluster installation", func() { hosts = append(hosts, host) } generateFullMeshConnectivity(ctx, "1.2.3.10", hosts...) + By("Approve Agents") for _, host := range hosts { hostkey := types.NamespacedName{ Namespace: Options.Namespace, @@ -433,7 +449,15 @@ var _ = Describe("[kube-api]cluster installation", func() { return kubeClient.Update(ctx, agent) }, "30s", "10s").Should(BeNil()) } - checkClusterCondition(ctx, key, controllers.ClusterReadyForInstallationCondition, controllers.ClusterAlreadyInstallingReason) + By("Verify ClusterDeployment ReadyForInstallation") + checkClusterCondition(ctx, clusterKey, controllers.ClusterReadyForInstallationCondition, controllers.ClusterAlreadyInstallingReason) + By("Delete ClusterDeployment") + err := kubeClient.Delete(ctx, getClusterDeploymentCRD(ctx, kubeClient, clusterKey)) + Expect(err).To(BeNil()) + By("Verify ClusterDeployment Agents were deleted") + Eventually(func() int { + return len(getClusterDeploymentAgents(ctx, kubeClient, clusterKey).Items) + }, "2m", "2s").Should(Equal(0)) }) It("deploy clusterDeployment with agent and update agent", func() { @@ -1180,6 +1204,10 @@ var _ = Describe("[kube-api]cluster installation", func() { cluster = getClusterFromDB(ctx, kubeClient, db, clusterKey, waitForReconcileTimeout) Expect(*cluster.Kind).Should(Equal(models.ClusterKindAddHostsCluster)) + By("Verify ClusterDeployment Agents were deleted") + Eventually(func() int { + return len(getClusterDeploymentAgents(ctx, kubeClient, clusterKey).Items) + }, "2m", "2s").Should(Equal(0)) By("Verify Cluster Metadata") Eventually(func() bool { return getClusterDeploymentCRD(ctx, kubeClient, clusterKey).Spec.Installed @@ -1266,6 +1294,10 @@ var _ = Describe("[kube-api]cluster installation", func() { cluster = getClusterFromDB(ctx, kubeClient, db, clusterKey, waitForReconcileTimeout) Expect(*cluster.Kind).Should(Equal(models.ClusterKindAddHostsCluster)) + By("Verify ClusterDeployment Agents were deleted") + Eventually(func() int { + return len(getClusterDeploymentAgents(ctx, kubeClient, clusterKey).Items) + }, "2m", "2s").Should(Equal(0)) By("Add Day 2 host and approve agent") configureLocalAgentClient(cluster.ID.String()) host := setupNewHost(ctx, "hostnameday2", *cluster.ID)