From 892c41a194893fb69f0263d359d4d053093e62a8 Mon Sep 17 00:00:00 2001 From: Mark Rogers <4752942+mrog@users.noreply.github.com> Date: Tue, 21 Jun 2022 09:26:16 -0600 Subject: [PATCH 1/2] Discard and replace new VMs if they never join the cluster --- api/v1beta1/cloudstackmachine_types.go | 14 +++++++++++++ api/v1beta1/zz_generated.deepcopy.go | 1 + ...e.cluster.x-k8s.io_cloudstackmachines.yaml | 5 +++++ ...loudstackmachinestatechecker_controller.go | 21 +++++++++++++++++-- pkg/cloud/instance.go | 7 ++++++- test/e2e/README.md | 2 +- 6 files changed, 46 insertions(+), 4 deletions(-) diff --git a/api/v1beta1/cloudstackmachine_types.go b/api/v1beta1/cloudstackmachine_types.go index bd1499c3..371a1569 100644 --- a/api/v1beta1/cloudstackmachine_types.go +++ b/api/v1beta1/cloudstackmachine_types.go @@ -19,6 +19,7 @@ package v1beta1 import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "time" ) const ( @@ -130,10 +131,23 @@ type CloudStackMachineStatus struct { // +optional InstanceState InstanceState `json:"instanceState,omitempty"` + // InstanceStateLastUpdated is the time the instance state was last updated. + // +optional + InstanceStateLastUpdated metav1.Time `json:"instanceStateLastUpdated,omitempty"` + // Ready indicates the readiness of the provider resource. Ready bool `json:"ready"` } +// TimeSinceLastStateChange returns the amount of time that's elapsed since the state was last updated. If the state +// hasn't ever been updated, it returns a negative value. +func (s *CloudStackMachineStatus) TimeSinceLastStateChange() time.Duration { + if s.InstanceStateLastUpdated.IsZero() { + return time.Duration(-1) + } + return time.Since(s.InstanceStateLastUpdated.Time) +} + // +kubebuilder:object:root=true // +kubebuilder:resource:path=cloudstackmachines,scope=Namespaced,categories=cluster-api,shortName=csm // +kubebuilder:storageversion diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 73c3f42b..34132290 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -541,6 +541,7 @@ func (in *CloudStackMachineStatus) DeepCopyInto(out *CloudStackMachineStatus) { *out = make([]v1.NodeAddress, len(*in)) copy(*out, *in) } + in.InstanceStateLastUpdated.DeepCopyInto(&out.InstanceStateLastUpdated) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CloudStackMachineStatus. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudstackmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudstackmachines.yaml index a0fead03..cfbf4704 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudstackmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudstackmachines.yaml @@ -235,6 +235,11 @@ spec: description: InstanceState is the state of the CloudStack instance for this machine. type: string + instanceStateLastUpdated: + description: InstanceStateLastUpdated is the time the instance state + was last updated. + format: date-time + type: string ready: description: Ready indicates the readiness of the provider resource. type: boolean diff --git a/controllers/cloudstackmachinestatechecker_controller.go b/controllers/cloudstackmachinestatechecker_controller.go index 605a1ed9..55a18998 100644 --- a/controllers/cloudstackmachinestatechecker_controller.go +++ b/controllers/cloudstackmachinestatechecker_controller.go @@ -70,6 +70,7 @@ func (r *CloudStackMachineStateCheckerReconciliationRunner) Reconcile() (ctrl.Re if res, err := r.GetParent(r.ReconciliationSubject, r.CSMachine)(); r.ShouldReturn(res, err) { return res, err } + if res, err := r.GetParent(r.CSMachine, r.CAPIMachine)(); r.ShouldReturn(res, err) { return res, err } @@ -77,9 +78,25 @@ func (r *CloudStackMachineStateCheckerReconciliationRunner) Reconcile() (ctrl.Re if err := r.CSClient.ResolveVMInstanceDetails(r.CSMachine); err != nil && !strings.Contains(strings.ToLower(err.Error()), "no match found") { return r.ReturnWrappedError(err, "failed to resolve VM instance details") } - if r.CSMachine.Status.InstanceState == "Running" { + + csRunning := r.CSMachine.Status.InstanceState == "Running" + csTimeInState := r.CSMachine.Status.TimeSinceLastStateChange() + capiRunning := r.CAPIMachine.Status.Phase == "Running" + + // capiTimeout indicates that a new VM is running, but it isn't reachable due to a network issue or a misconfiguration. + // When this happens, the machine should be deleted or else the cluster won't ever recover. + capiTimeout := csRunning && !capiRunning && csTimeInState > 5*time.Minute + + if csRunning && capiRunning { r.ReconciliationSubject.Status.Ready = true - } else { + } else if !csRunning || capiTimeout { + r.Log.Info("CloudStack instance in bad state", + "name", r.CSMachine.Name, + "instance-id", r.CSMachine.Spec.InstanceID, + "cs-state", r.CSMachine.Status.InstanceState, + "cs-time-in-state", csTimeInState.String(), + "capi-phase", r.CAPIMachine.Status.Phase) + if err := r.K8sClient.Delete(r.RequestCtx, r.CAPIMachine); err != nil { return r.ReturnWrappedError(err, "failed to delete CAPI machine") } diff --git a/pkg/cloud/instance.go b/pkg/cloud/instance.go index 29e7658c..d899e1d6 100644 --- a/pkg/cloud/instance.go +++ b/pkg/cloud/instance.go @@ -18,6 +18,7 @@ package cloud import ( "fmt" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "strings" capiv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -41,7 +42,11 @@ func setMachineDataFromVMMetrics(vmResponse *cloudstack.VirtualMachinesMetric, c csMachine.Spec.ProviderID = pointer.StringPtr(fmt.Sprintf("cloudstack:///%s", vmResponse.Id)) csMachine.Spec.InstanceID = pointer.StringPtr(vmResponse.Id) csMachine.Status.Addresses = []corev1.NodeAddress{{Type: corev1.NodeInternalIP, Address: vmResponse.Ipaddress}} - csMachine.Status.InstanceState = infrav1.InstanceState(vmResponse.State) + newInstanceState := infrav1.InstanceState(vmResponse.State) + if newInstanceState != csMachine.Status.InstanceState || (newInstanceState != "" && csMachine.Status.InstanceStateLastUpdated.IsZero()) { + csMachine.Status.InstanceState = newInstanceState + csMachine.Status.InstanceStateLastUpdated = metav1.Now() + } } // ResolveVMInstanceDetails Retrieves VM instance details by csMachine.Spec.InstanceID or csMachine.Name, and diff --git a/test/e2e/README.md b/test/e2e/README.md index cb474370..6aed2e4d 100644 --- a/test/e2e/README.md +++ b/test/e2e/README.md @@ -58,6 +58,6 @@ You can specify JOB environment variable which value is a regular expression to For example, ```shell -JOB=PR-Blocking make run +JOB=PR-Blocking make run-e2e ``` This command runs the e2e tests that contains `PR-Blocking` in their spec names. From 187d4689fdaeb5aa2c3718b5876c5777764d1fbd Mon Sep 17 00:00:00 2001 From: Mark Rogers <4752942+mrog@users.noreply.github.com> Date: Tue, 21 Jun 2022 11:31:56 -0600 Subject: [PATCH 2/2] Added unit tests --- api/v1beta1/cloudstackmachine_types_test.go | 46 +++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 api/v1beta1/cloudstackmachine_types_test.go diff --git a/api/v1beta1/cloudstackmachine_types_test.go b/api/v1beta1/cloudstackmachine_types_test.go new file mode 100644 index 00000000..4344db25 --- /dev/null +++ b/api/v1beta1/cloudstackmachine_types_test.go @@ -0,0 +1,46 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1beta1_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + infrav1 "sigs.k8s.io/cluster-api-provider-cloudstack/api/v1beta1" + "time" +) + +var _ = Describe("CloudStackMachine types", func() { + var cloudStackMachine infrav1.CloudStackMachine + + BeforeEach(func() { // Reset test vars to initial state. + cloudStackMachine = infrav1.CloudStackMachine{} + }) + + Context("When calculating time since state change", func() { + It("Return the correct value when the last state update time is known", func() { + delta := time.Duration(10 * time.Minute) + lastUpdated := time.Now().Add(-delta) + cloudStackMachine.Status.InstanceStateLastUpdated = metav1.NewTime(lastUpdated) + Ω(cloudStackMachine.Status.TimeSinceLastStateChange()).Should(BeNumerically("~", delta, time.Second)) + }) + + It("Return a negative value when the last state update time is unknown", func() { + Ω(cloudStackMachine.Status.TimeSinceLastStateChange()).Should(BeNumerically("<", 0)) + }) + }) +})