Skip to content

Commit

Permalink
MGMT-4261 Agent CR cleanup when deleting clusterDeployment
Browse files Browse the repository at this point in the history
clusterdeployments_controller:
When a clusterDeployment CR gets deleted, the controller detects the
relevant agents and pushes updates to the agent_controller, which
in turn will reconcile and delete them.

agent_controller:
The controller will delete agent CR if one of the following scenarios
takes place during reconcile:
1. No clusterDeployment was found for the agent.
2. No cluster (backend) was found for the agent.
3. Agent does not belong to the cluster retrieved from the backend,
   which may happen with newly created day2 clusters.

P.S.
Note that this PR does not cover host dergister upon Agent CR deletion, however
hosts get cleaned up from the backend for the above-mentioned scenarios.
  • Loading branch information
Nir Magnezi committed Apr 29, 2021
1 parent d10163a commit a73b8fe
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 24 deletions.
4 changes: 4 additions & 0 deletions internal/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ func (m *Manager) DeregisterCluster(ctx context.Context, c *common.Cluster) erro
} else {
m.eventsHandler.AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo,
fmt.Sprintf("Deregistered cluster: %s", c.ID.String()), time.Now())
for _, h := range c.Hosts {
m.eventsHandler.AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo,
fmt.Sprintf("Deregistered cluster: %s host %s", c.ID.String(), h.ID.String()), time.Now())
}
}
return err
}
Expand Down
2 changes: 2 additions & 0 deletions internal/cluster/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2753,6 +2753,7 @@ var _ = Describe("Validation metrics and events", func() {
mockHost.EXPECT().ReportValidationFailedMetrics(ctx, gomock.Any(), openshiftVersion, emailDomain)
mockMetric.EXPECT().ClusterValidationFailed(openshiftVersion, emailDomain, models.ClusterValidationIDSufficientMastersCount)
mockEvents.EXPECT().AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo, gomock.Any(), gomock.Any())
mockEvents.EXPECT().AddEvent(ctx, *c.ID, gomock.Any(), models.EventSeverityInfo, gomock.Any(), gomock.Any())

err := m.DeregisterCluster(ctx, c)
Expect(err).ShouldNot(HaveOccurred())
Expand All @@ -2764,6 +2765,7 @@ var _ = Describe("Validation metrics and events", func() {
mockHost.EXPECT().ReportValidationFailedMetrics(ctx, gomock.Any(), openshiftVersion, emailDomain)
mockMetric.EXPECT().ClusterValidationFailed(openshiftVersion, emailDomain, models.ClusterValidationIDSufficientMastersCount)
mockEvents.EXPECT().AddEvent(ctx, *c.ID, nil, models.EventSeverityInfo, gomock.Any(), gomock.Any())
mockEvents.EXPECT().AddEvent(ctx, *c.ID, gomock.Any(), models.EventSeverityInfo, gomock.Any(), gomock.Any())

err := m.DeregisterCluster(ctx, c)
Expect(err).ShouldNot(HaveOccurred())
Expand Down
25 changes: 24 additions & 1 deletion internal/controller/controllers/agent_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -80,6 +81,9 @@ func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl

// Retrieve clusterDeployment
if err = r.Get(ctx, kubeKey, clusterDeployment); err != nil {
if k8serrors.IsNotFound(err) {
return r.deleteAgent(ctx, req.NamespacedName)
}
errMsg := fmt.Sprintf("failed to get clusterDeployment with name %s in namespace %s",
agent.Spec.ClusterDeploymentName.Name, agent.Spec.ClusterDeploymentName.Namespace)
// Update that we failed to retrieve the clusterDeployment
Expand All @@ -89,14 +93,18 @@ func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
// Retrieve cluster for ClusterDeploymentName from the database
cluster, err := r.Installer.GetClusterByKubeKey(kubeKey)
if err != nil {
if errors.Is(err, gorm.ErrRecordNotFound) {
return r.deleteAgent(ctx, req.NamespacedName)
}
// Update that we failed to retrieve the cluster from the database
return r.updateStatus(ctx, agent, nil, err, !errors.Is(err, gorm.ErrRecordNotFound))
}

//Retrieve host from cluster
host := getHostFromCluster(cluster, agent.Name)
if host == nil {
return r.updateStatus(ctx, agent, nil, errors.New("host not found in cluster"), false)
// Host is not a part of the cluster, which may happen with newly created day2 clusters.
return r.deleteAgent(ctx, req.NamespacedName)
}

// check for updates from user, compare spec and update if needed
Expand All @@ -113,6 +121,21 @@ func (r *AgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
return r.updateStatus(ctx, agent, host, nil, false)
}

func (r *AgentReconciler) deleteAgent(ctx context.Context, agent types.NamespacedName) (ctrl.Result, error) {
Requeue := false
agentToDelete := &aiv1beta1.Agent{
ObjectMeta: metav1.ObjectMeta{
Name: agent.Name,
Namespace: agent.Namespace,
},
}
if err := r.Client.Delete(ctx, agentToDelete); err != nil {
r.Log.WithError(err).Errorf("Failed to get delete resource %s %s", agent.Name, agent.Namespace)
Requeue = true
}
return ctrl.Result{Requeue: Requeue}, nil
}

// updateStatus is updating all the Agent Conditions.
// In case that an error has occured when trying to sync the Spec, the error (syncErr) is presented in SpecSyncedCondition.
// Internal bool differentiate between backend server error (internal HTTP 5XX) and user input error (HTTP 4XXX)
Expand Down
25 changes: 6 additions & 19 deletions internal/controller/controllers/agent_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ var _ = Describe("agent reconcile", func() {
Expect(c.Get(ctx, key, agent)).To(BeNil())
})

It("cluster deployment not found", func() {
It("Agent not created when cluster deployment not found", func() {
host := newAgent("host", testNamespace, v1beta1.AgentSpec{ClusterDeploymentName: &v1beta1.ClusterReference{Name: "clusterDeployment", Namespace: testNamespace}})
Expect(c.Create(ctx, host)).To(BeNil())
result, err := hr.Reconcile(ctx, newHostRequest(host))
Expand All @@ -114,15 +114,10 @@ var _ = Describe("agent reconcile", func() {
Namespace: testNamespace,
Name: "host",
}
Expect(c.Get(ctx, key, agent)).To(BeNil())
expectedState := fmt.Sprintf("%s failed to get clusterDeployment with name clusterDeployment in namespace test-namespace: clusterdeployments.hive.openshift.io \"clusterDeployment\" not found", InputErrorMsg)
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Message).To(Equal(expectedState))
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Reason).To(Equal(InputErrorReason))
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse))

Expect(c.Get(ctx, key, agent).Error()).To(Equal("agents.agent-install.openshift.io \"host\" not found"))
})

It("cluster not found in database", func() {
It("Agent not created when cluster not found in database", func() {
host := newAgent("host", testNamespace, v1beta1.AgentSpec{ClusterDeploymentName: &v1beta1.ClusterReference{Name: "clusterDeployment", Namespace: testNamespace}})
Expect(c.Create(ctx, host)).To(BeNil())
clusterDeployment := newClusterDeployment("clusterDeployment", testNamespace, getDefaultClusterDeploymentSpec("clusterDeployment-test", "pull-secret"))
Expand All @@ -137,11 +132,7 @@ var _ = Describe("agent reconcile", func() {
Namespace: testNamespace,
Name: "host",
}
Expect(c.Get(ctx, key, agent)).To(BeNil())
expectedState := fmt.Sprintf("%s record not found", InputErrorMsg)
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Message).To(Equal(expectedState))
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Reason).To(Equal(InputErrorReason))
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse))
Expect(c.Get(ctx, key, agent).Error()).To(Equal("agents.agent-install.openshift.io \"host\" not found"))
})

It("error getting cluster from database", func() {
Expand All @@ -168,7 +159,7 @@ var _ = Describe("agent reconcile", func() {
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse))
})

It("host not found in cluster", func() {
It("Agent not created when host not found in cluster", func() {
host := newAgent("host", testNamespace, v1beta1.AgentSpec{ClusterDeploymentName: &v1beta1.ClusterReference{Name: "clusterDeployment", Namespace: testNamespace}})
clusterDeployment := newClusterDeployment("clusterDeployment", testNamespace, getDefaultClusterDeploymentSpec("clusterDeployment-test", "pull-secret"))
Expect(c.Create(ctx, clusterDeployment)).To(BeNil())
Expand All @@ -183,11 +174,7 @@ var _ = Describe("agent reconcile", func() {
Namespace: testNamespace,
Name: "host",
}
Expect(c.Get(ctx, key, agent)).To(BeNil())
expectedState := fmt.Sprintf("%s host not found in cluster", InputErrorMsg)
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Message).To(Equal(expectedState))
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Reason).To(Equal(InputErrorReason))
Expect(conditionsv1.FindStatusCondition(agent.Status.Conditions, SpecSyncedCondition).Status).To(Equal(corev1.ConditionFalse))
Expect(c.Get(ctx, key, agent).Error()).To(Equal("agents.agent-install.openshift.io \"host\" not found"))
})

It("Agent update", func() {
Expand Down
40 changes: 36 additions & 4 deletions subsystem/kubeapi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,20 @@ func getClusterFromDB(
return cluster
}

func getClusterDeploymentAgents(ctx context.Context, client k8sclient.Client, clusterDeployment types.NamespacedName) *v1beta1.AgentList {
agents := &v1beta1.AgentList{}
clusterAgents := &v1beta1.AgentList{}
err := client.List(ctx, agents)
Expect(err).To(BeNil())
for _, agent := range agents.Items {
if agent.Spec.ClusterDeploymentName.Name == clusterDeployment.Name &&
agent.Spec.ClusterDeploymentName.Namespace == clusterDeployment.Namespace {
clusterAgents.Items = append(clusterAgents.Items, agent)
}
}
return clusterAgents
}

func getClusterDeploymentCRD(ctx context.Context, client k8sclient.Client, key types.NamespacedName) *hivev1.ClusterDeployment {
cluster := &hivev1.ClusterDeployment{}
err := client.Get(ctx, key, cluster)
Expand Down Expand Up @@ -405,15 +419,16 @@ var _ = Describe("[kube-api]cluster installation", func() {
clearDB()
})

It("deploy clusterDeployment with agents and wait for ready", func() {
It("deploy clusterDeployment with agents and wait for ready, delete and verify agents deleted", func() {
secretRef := deployLocalObjectSecretIfNeeded(ctx, kubeClient)
spec := getDefaultClusterDeploymentSpec(secretRef)
By("Create cluster")
deployClusterDeploymentCRD(ctx, kubeClient, spec)
key := types.NamespacedName{
clusterKey := types.NamespacedName{
Namespace: Options.Namespace,
Name: spec.ClusterName,
}
cluster := getClusterFromDB(ctx, kubeClient, db, key, waitForReconcileTimeout)
cluster := getClusterFromDB(ctx, kubeClient, db, clusterKey, waitForReconcileTimeout)
configureLocalAgentClient(cluster.ID.String())
hosts := make([]*models.Host, 0)
for i := 0; i < 3; i++ {
Expand All @@ -422,6 +437,7 @@ var _ = Describe("[kube-api]cluster installation", func() {
hosts = append(hosts, host)
}
generateFullMeshConnectivity(ctx, "1.2.3.10", hosts...)
By("Approve Agents")
for _, host := range hosts {
hostkey := types.NamespacedName{
Namespace: Options.Namespace,
Expand All @@ -433,7 +449,15 @@ var _ = Describe("[kube-api]cluster installation", func() {
return kubeClient.Update(ctx, agent)
}, "30s", "10s").Should(BeNil())
}
checkClusterCondition(ctx, key, controllers.ClusterReadyForInstallationCondition, controllers.ClusterAlreadyInstallingReason)
By("Verify ClusterDeployment ReadyForInstallation")
checkClusterCondition(ctx, clusterKey, controllers.ClusterReadyForInstallationCondition, controllers.ClusterAlreadyInstallingReason)
By("Delete ClusterDeployment")
err := kubeClient.Delete(ctx, getClusterDeploymentCRD(ctx, kubeClient, clusterKey))
Expect(err).To(BeNil())
By("Verify ClusterDeployment Agents were deleted")
Eventually(func() int {
return len(getClusterDeploymentAgents(ctx, kubeClient, clusterKey).Items)
}, "2m", "2s").Should(Equal(0))
})

It("deploy clusterDeployment with agent and update agent", func() {
Expand Down Expand Up @@ -1180,6 +1204,10 @@ var _ = Describe("[kube-api]cluster installation", func() {
cluster = getClusterFromDB(ctx, kubeClient, db, clusterKey, waitForReconcileTimeout)
Expect(*cluster.Kind).Should(Equal(models.ClusterKindAddHostsCluster))

By("Verify ClusterDeployment Agents were deleted")
Eventually(func() int {
return len(getClusterDeploymentAgents(ctx, kubeClient, clusterKey).Items)
}, "2m", "2s").Should(Equal(0))
By("Verify Cluster Metadata")
Eventually(func() bool {
return getClusterDeploymentCRD(ctx, kubeClient, clusterKey).Spec.Installed
Expand Down Expand Up @@ -1266,6 +1294,10 @@ var _ = Describe("[kube-api]cluster installation", func() {
cluster = getClusterFromDB(ctx, kubeClient, db, clusterKey, waitForReconcileTimeout)
Expect(*cluster.Kind).Should(Equal(models.ClusterKindAddHostsCluster))

By("Verify ClusterDeployment Agents were deleted")
Eventually(func() int {
return len(getClusterDeploymentAgents(ctx, kubeClient, clusterKey).Items)
}, "2m", "2s").Should(Equal(0))
By("Add Day 2 host and approve agent")
configureLocalAgentClient(cluster.ID.String())
host := setupNewHost(ctx, "hostnameday2", *cluster.ID)
Expand Down

0 comments on commit a73b8fe

Please sign in to comment.