diff --git a/test/e2e/config/docker-ci.yaml b/test/e2e/config/docker-ci.yaml index ff7d880bcb85..06e6658edf3b 100644 --- a/test/e2e/config/docker-ci.yaml +++ b/test/e2e/config/docker-ci.yaml @@ -79,4 +79,4 @@ intervals: default/wait-worker-nodes: ["5m", "10s"] default/wait-delete-cluster: ["3m", "10s"] default/wait-machine-upgrade: ["20m", "10s"] - default/wait-machine-remediation: ["3m", "10s"] + default/wait-machine-remediation: ["5m", "10s"] diff --git a/test/e2e/data/infrastructure-docker/cluster-template-ci.yaml b/test/e2e/data/infrastructure-docker/cluster-template-ci.yaml index fbc0f5a2acea..12a397dee990 100644 --- a/test/e2e/data/infrastructure-docker/cluster-template-ci.yaml +++ b/test/e2e/data/infrastructure-docker/cluster-template-ci.yaml @@ -93,6 +93,9 @@ spec: selector: matchLabels: template: + metadata: + labels: + "nodepool": "pool1" spec: clusterName: "${ CLUSTER_NAME }" version: "${ KUBERNETES_VERSION }" @@ -105,3 +108,18 @@ spec: name: "${ CLUSTER_NAME }-md-0" apiVersion: infrastructure.cluster.x-k8s.io/v1alpha3 kind: DockerMachineTemplate +--- +apiVersion: cluster.x-k8s.io/v1alpha3 +kind: MachineHealthCheck +metadata: + name: "${CLUSTER_NAME}-mhc-0" +spec: + clusterName: "${ CLUSTER_NAME }" + maxUnhealthy: 100% + selector: + matchLabels: + nodepool: "pool1" + unhealthyConditions: + - type: E2ENodeUnhealthy + status: "True" + timeout: 30s \ No newline at end of file diff --git a/test/e2e/data/infrastructure-docker/cluster-template.yaml b/test/e2e/data/infrastructure-docker/cluster-template.yaml index fbc0f5a2acea..12a397dee990 100644 --- a/test/e2e/data/infrastructure-docker/cluster-template.yaml +++ b/test/e2e/data/infrastructure-docker/cluster-template.yaml @@ -93,6 +93,9 @@ spec: selector: matchLabels: template: + metadata: + labels: + "nodepool": "pool1" spec: clusterName: "${ CLUSTER_NAME }" version: "${ KUBERNETES_VERSION }" @@ -105,3 +108,18 @@ spec: name: "${ CLUSTER_NAME }-md-0" apiVersion: infrastructure.cluster.x-k8s.io/v1alpha3 kind: DockerMachineTemplate +--- +apiVersion: cluster.x-k8s.io/v1alpha3 +kind: MachineHealthCheck +metadata: + name: "${CLUSTER_NAME}-mhc-0" +spec: + clusterName: "${ CLUSTER_NAME }" + maxUnhealthy: 100% + selector: + matchLabels: + nodepool: "pool1" + unhealthyConditions: + - type: E2ENodeUnhealthy + status: "True" + timeout: 30s \ No newline at end of file diff --git a/test/e2e/mhc_remediations.go b/test/e2e/mhc_remediations.go new file mode 100644 index 000000000000..d5b7e77e4ed8 --- /dev/null +++ b/test/e2e/mhc_remediations.go @@ -0,0 +1,108 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "fmt" + "os" + "path/filepath" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "k8s.io/utils/pointer" + clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/cluster-api/test/framework" + "sigs.k8s.io/cluster-api/test/framework/clusterctl" + "sigs.k8s.io/cluster-api/util" +) + +// MachineRemediationSpecInput is the input for MachineRemediationSpec. +type MachineRemediationSpecInput struct { + E2EConfig *clusterctl.E2EConfig + ClusterctlConfigPath string + BootstrapClusterProxy framework.ClusterProxy + ArtifactFolder string + SkipCleanup bool +} + +// MachineRemediationSpec implements a test that verifies that Machines are remediated by MHC during unhealthy conditions. +func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemediationSpecInput) { + var ( + specName = "mhc-remediation" + input MachineRemediationSpecInput + namespace *corev1.Namespace + cancelWatches context.CancelFunc + cluster *clusterv1.Cluster + ) + + BeforeEach(func() { + Expect(ctx).NotTo(BeNil(), "ctx is required for %s spec", specName) + input = inputGetter() + Expect(input.E2EConfig).ToNot(BeNil(), "Invalid argument. input.E2EConfig can't be nil when calling %s spec", specName) + Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling %s spec", specName) + Expect(input.BootstrapClusterProxy).ToNot(BeNil(), "Invalid argument. input.BootstrapClusterProxy can't be nil when calling %s spec", specName) + Expect(os.MkdirAll(input.ArtifactFolder, 0755)).To(Succeed(), "Invalid argument. input.ArtifactFolder can't be created for %s spec", specName) + Expect(input.E2EConfig.Variables).To(HaveKey(KubernetesVersion)) + Expect(input.E2EConfig.Variables).To(HaveKey(CNIPath)) + + // Setup a Namespace where to host objects for this spec and create a watcher for the namespace events. + namespace, cancelWatches = setupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder) + }) + + It("Should successfully remediate unhealthy machines with MachineHealthCheck", func() { + + By("Creating a workload cluster") + + var mds []*clusterv1.MachineDeployment + cluster, _, mds = clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ + ClusterProxy: input.BootstrapClusterProxy, + ConfigCluster: clusterctl.ConfigClusterInput{ + LogFolder: filepath.Join(input.ArtifactFolder, "clusters", input.BootstrapClusterProxy.GetName()), + ClusterctlConfigPath: input.ClusterctlConfigPath, + KubeconfigPath: input.BootstrapClusterProxy.GetKubeconfigPath(), + InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, + Flavor: clusterctl.DefaultFlavor, + Namespace: namespace.Name, + ClusterName: fmt.Sprintf("cluster-%s", util.RandomString(6)), + KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersion), + ControlPlaneMachineCount: pointer.Int64Ptr(1), + WorkerMachineCount: pointer.Int64Ptr(1), + }, + CNIManifestPath: input.E2EConfig.GetVariable(CNIPath), + WaitForClusterIntervals: input.E2EConfig.GetIntervals(specName, "wait-cluster"), + WaitForControlPlaneIntervals: input.E2EConfig.GetIntervals(specName, "wait-control-plane"), + WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"), + }) + + By("Waiting for MachineHealthCheck remediation") + framework.DiscoverMachineHealthChecksAndWaitForRemediation(ctx, framework.DiscoverMachineHealthCheckAndWaitForRemediationInput{ + ClusterProxy: input.BootstrapClusterProxy, + Cluster: cluster, + WaitForMachineRemediation: input.E2EConfig.GetIntervals(specName, "wait-machine-remediation"), + }) + + By("PASSED!") + }) + + AfterEach(func() { + // Dumps all the resources in the spec namespace, then cleanups the cluster object and the spec namespace itself. + dumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder, namespace, cancelWatches, cluster, input.E2EConfig.GetIntervals, input.SkipCleanup) + }) +} diff --git a/test/e2e/mhc_remediations_test.go b/test/e2e/mhc_remediations_test.go new file mode 100644 index 000000000000..2c7d534210d5 --- /dev/null +++ b/test/e2e/mhc_remediations_test.go @@ -0,0 +1,39 @@ +// +build e2e + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + + . "github.com/onsi/ginkgo" +) + +var _ = Describe("When testing unhealthy machines remediation", func() { + + MachineRemediationSpec(context.TODO(), func() MachineRemediationSpecInput { + return MachineRemediationSpecInput{ + E2EConfig: e2eConfig, + ClusterctlConfigPath: clusterctlConfigPath, + BootstrapClusterProxy: bootstrapClusterProxy, + ArtifactFolder: artifactFolder, + SkipCleanup: skipCleanup, + } + }) + +}) diff --git a/test/framework/machine_helpers.go b/test/framework/machine_helpers.go index 1c8eb16e9095..bd7ea0561aa9 100644 --- a/test/framework/machine_helpers.go +++ b/test/framework/machine_helpers.go @@ -57,6 +57,29 @@ func GetMachinesByMachineDeployments(ctx context.Context, input GetMachinesByMac return machineList.Items } +// GetMachinesByMachineHealthCheckInput is the input for GetMachinesByMachineHealthCheck. +type GetMachinesByMachineHealthCheckInput struct { + Lister Lister + ClusterName string + MachineHealthCheck *clusterv1.MachineHealthCheck +} + +// GetMachinesByMachineHealthCheckInput returns Machine objects for a cluster that match with MachineHealthCheck selector. +func GetMachinesByMachineHealthCheck(ctx context.Context, input GetMachinesByMachineHealthCheckInput) []clusterv1.Machine { + Expect(ctx).NotTo(BeNil(), "ctx is required for GetMachinesByMachineDeployments") + Expect(input.Lister).ToNot(BeNil(), "Invalid argument. input.Lister can't be nil when calling GetMachinesByMachineHealthCheck") + Expect(input.ClusterName).ToNot(BeEmpty(), "Invalid argument. input.ClusterName can't be empty when calling GetMachinesByMachineHealthCheck") + Expect(input.MachineHealthCheck).ToNot(BeNil(), "Invalid argument. input.MachineHealthCheck can't be nil when calling GetMachinesByMachineHealthCheck") + + opts := byClusterOptions(input.ClusterName, input.MachineHealthCheck.Namespace) + opts = append(opts, machineHealthCheckOptions(*input.MachineHealthCheck)...) + + machineList := &clusterv1.MachineList{} + Expect(input.Lister.List(ctx, machineList, opts...)).To(Succeed(), "Failed to list MachineList object for Cluster %s/%s", input.MachineHealthCheck.Namespace, input.ClusterName) + + return machineList.Items +} + // GetControlPlaneMachinesByClusterInput is the input for GetControlPlaneMachinesByCluster. type GetControlPlaneMachinesByClusterInput struct { Lister Lister @@ -100,7 +123,7 @@ func WaitForControlPlaneMachinesToBeUpgraded(ctx context.Context, input WaitForC fmt.Fprintf(GinkgoWriter, "Ensuring all MachineDeployment Machines have upgraded kubernetes version %s\n", input.KubernetesUpgradeVersion) Eventually(func() (int, error) { - machines := GetControlPlaneMachinesByCluster(context.TODO(), GetControlPlaneMachinesByClusterInput{ + machines := GetControlPlaneMachinesByCluster(ctx, GetControlPlaneMachinesByClusterInput{ Lister: input.Lister, ClusterName: input.Cluster.Name, Namespace: input.Cluster.Namespace, @@ -139,7 +162,7 @@ func WaitForMachineDeploymentMachinesToBeUpgraded(ctx context.Context, input Wai fmt.Fprintf(GinkgoWriter, "Ensuring all MachineDeployment Machines have upgraded kubernetes version %s\n", input.KubernetesUpgradeVersion) Eventually(func() (int, error) { - machines := GetMachinesByMachineDeployments(context.TODO(), GetMachinesByMachineDeploymentsInput{ + machines := GetMachinesByMachineDeployments(ctx, GetMachinesByMachineDeploymentsInput{ Lister: input.Lister, ClusterName: input.Cluster.Name, Namespace: input.Cluster.Namespace, diff --git a/test/framework/machinehealthcheck_helpers.go b/test/framework/machinehealthcheck_helpers.go new file mode 100644 index 000000000000..03e18b9ec916 --- /dev/null +++ b/test/framework/machinehealthcheck_helpers.go @@ -0,0 +1,173 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package framework + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// DiscoverMachineHealthCheckAndWaitForRemediationInput is the input for DiscoverMachineHealthCheckAndWait. +type DiscoverMachineHealthCheckAndWaitForRemediationInput struct { + ClusterProxy ClusterProxy + Cluster *clusterv1.Cluster + WaitForMachineRemediation []interface{} +} + +// DiscoverMachineHealthCheckAndWait patches an unhealthy node condition to one node observed by the Machine Health Check and then wait for remediation. +func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input DiscoverMachineHealthCheckAndWaitForRemediationInput) { + Expect(ctx).NotTo(BeNil(), "ctx is required for DiscoverMachineHealthChecksAndWaitForRemediation") + Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling DiscoverMachineHealthChecksAndWaitForRemediation") + Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling DiscoverMachineHealthChecksAndWaitForRemediation") + + mgmtClient := input.ClusterProxy.GetClient() + fmt.Fprintln(GinkgoWriter, "Discovering machine health check resources") + machineHealthChecks := GetMachineHealthChecksForCluster(ctx, GetMachineHealthChecksForClusterInput{ + Lister: mgmtClient, + ClusterName: input.Cluster.Name, + Namespace: input.Cluster.Namespace, + }) + + Expect(machineHealthChecks).NotTo(BeEmpty()) + + for _, mhc := range machineHealthChecks { + Expect(mhc.Spec.UnhealthyConditions).NotTo(BeEmpty()) + + fmt.Fprintln(GinkgoWriter, "Ensuring there is at least 1 Machine that MachineHealthCheck is matching") + machines := GetMachinesByMachineHealthCheck(ctx, GetMachinesByMachineHealthCheckInput{ + Lister: mgmtClient, + ClusterName: input.Cluster.Name, + MachineHealthCheck: mhc, + }) + + Expect(machines).NotTo(BeEmpty()) + + fmt.Fprintln(GinkgoWriter, "Patching MachineHealthCheck unhealthy condition to one of the nodes") + unhealthyNodeCondition := corev1.NodeCondition{ + Type: mhc.Spec.UnhealthyConditions[0].Type, + Status: mhc.Spec.UnhealthyConditions[0].Status, + LastTransitionTime: metav1.Time{Time: time.Now()}, + } + PatchNodeCondition(ctx, PatchNodeConditionInput{ + ClusterProxy: input.ClusterProxy, + Cluster: input.Cluster, + NodeCondition: unhealthyNodeCondition, + Machine: machines[0], + }) + } + + fmt.Fprintln(GinkgoWriter, "Waiting for remediation") + WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx, WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput{ + ClusterProxy: input.ClusterProxy, + Cluster: input.Cluster, + MachineHealthChecks: machineHealthChecks, + }, input.WaitForMachineRemediation...) +} + +// GetMachineHealthChecksForClusterInput is the input for GetMachineHealthChecksForCluster. +type GetMachineHealthChecksForClusterInput struct { + Lister Lister + ClusterName string + Namespace string +} + +// GetMachineHealthChecksForCluster returns the MachineHealthCheck objects for a cluster. +// Important! this method relies on labels that are created by the CAPI controllers during the first reconciliation, so +// it is necessary to ensure this is already happened before calling it. +func GetMachineHealthChecksForCluster(ctx context.Context, input GetMachineHealthChecksForClusterInput) []*clusterv1.MachineHealthCheck { + machineHealthCheckList := &clusterv1.MachineHealthCheckList{} + Expect(input.Lister.List(ctx, machineHealthCheckList, byClusterOptions(input.ClusterName, input.Namespace)...)).To(Succeed(), "Failed to list MachineDeployments object for Cluster %s/%s", input.Namespace, input.ClusterName) + + machineHealthChecks := make([]*clusterv1.MachineHealthCheck, len(machineHealthCheckList.Items)) + for i := range machineHealthCheckList.Items { + machineHealthChecks[i] = &machineHealthCheckList.Items[i] + } + return machineHealthChecks +} + +// machineHealthCheckOptions returns a set of ListOptions that allows to get all machine objects belonging to a MachineHealthCheck. +func machineHealthCheckOptions(machineHealthCheck clusterv1.MachineHealthCheck) []client.ListOption { + return []client.ListOption{ + client.MatchingLabels(machineHealthCheck.Spec.Selector.MatchLabels), + } +} + +// WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput is the input for WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition. +type WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput struct { + ClusterProxy ClusterProxy + Cluster *clusterv1.Cluster + MachineHealthChecks []*clusterv1.MachineHealthCheck +} + +// WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition patches a node condition to any one of the machines with a node ref. +func WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx context.Context, input WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput, intervals ...interface{}) { + Expect(ctx).NotTo(BeNil(), "ctx is required for WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition") + Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition") + Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition") + Expect(input.MachineHealthChecks).NotTo(BeEmpty(), "Invalid argument. input.MachineHealthChecks can't be empty when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition") + + for i := range input.MachineHealthChecks { + mhc := input.MachineHealthChecks[i] + fmt.Fprintln(GinkgoWriter, "Waiting until the node with unhealthy node condition is remediated") + Eventually(func() bool { + machines := GetMachinesByMachineHealthCheck(ctx, GetMachinesByMachineHealthCheckInput{ + Lister: input.ClusterProxy.GetClient(), + ClusterName: input.Cluster.Name, + MachineHealthCheck: mhc, + }) + Expect(machines).NotTo(BeEmpty()) + + for _, machine := range machines { + if machine.Status.NodeRef == nil { + return false + } + node := &corev1.Node{} + // This should not be an Expect(), because it may return error during machine deletion. + err := input.ClusterProxy.GetWorkloadCluster(ctx, input.Cluster.Namespace, input.Cluster.Name).GetClient().Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name, Namespace: machine.Status.NodeRef.Namespace}, node) + if err != nil { + return false + } + if hasMatchingUnhealthyConditions(mhc, node.Status.Conditions) { + return false + } + } + return true + }, intervals...).Should(BeTrue()) + } +} + +// hasMatchingUnhealthyConditions returns true if any node condition matches with machine health check unhealthy conditions +func hasMatchingUnhealthyConditions(machineHealthCheck *clusterv1.MachineHealthCheck, nodeConditions []corev1.NodeCondition) bool { + for _, unhealthyCondition := range machineHealthCheck.Spec.UnhealthyConditions { + for _, nodeCondition := range nodeConditions { + if nodeCondition.Type == unhealthyCondition.Type && nodeCondition.Status == unhealthyCondition.Status { + return true + } + } + } + return false +}