diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go
index 2ad30f2bd71f..dece1b3c0595 100644
--- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go
+++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"strings"
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -46,6 +47,7 @@ const (
 	resourceNameMachine           = "machines"
 	resourceNameMachineSet        = "machinesets"
 	resourceNameMachineDeployment = "machinedeployments"
+	failedMachinePrefix           = "failed-machine-"
 )
 
 // machineController watches for Nodes, Machines, MachineSets and
@@ -219,6 +221,16 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
 		}
 	}
 
+	if isFailedMachineProviderID(providerID) {
+		machine, err := c.findMachine(machineKeyFromFailedProviderID(providerID))
+		if err != nil {
+			return nil, err
+		}
+		if machine != nil {
+			return machine.DeepCopy(), nil
+		}
+	}
+
 	// If the machine object has no providerID--maybe actuator
 	// does not set this value (e.g., OpenStack)--then first
 	// lookup the node using ProviderID. If that is successful
@@ -234,6 +246,15 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
 	return c.findMachine(node.Annotations[machineAnnotationKey])
 }
 
+func isFailedMachineProviderID(providerID normalizedProviderID) bool {
+	return strings.HasPrefix(string(providerID), failedMachinePrefix)
+}
+
+func machineKeyFromFailedProviderID(providerID normalizedProviderID) string {
+	namespaceName := strings.TrimPrefix(string(providerID), failedMachinePrefix)
+	return strings.Replace(namespaceName, "_", "/", 1)
+}
+
 // findNodeByNodeName finds the Node object keyed by name.. Returns
 // nil if it cannot be found. A DeepCopy() of the object is returned
 // on success.
@@ -393,6 +414,17 @@ func (c *machineController) machineSetProviderIDs(machineSet *MachineSet) ([]str
 			continue
 		}
 
+		if machine.Status.FailureMessage != nil {
+			klog.V(4).Infof("Status.FailureMessage of machine %q is %q", machine.Name, *machine.Status.FailureMessage)
+			// Provide a fake ID to allow the autoscaler to track machines that will never
+			// become nodes and mark the nodegroup unhealthy after maxNodeProvisionTime.
+			// Fake ID needs to be recognised later and converted into a machine key.
+			// Use an underscore as a separator between namespace and name as it is not a
+			// valid character within a namespace name.
+			providerIDs = append(providerIDs, fmt.Sprintf("%s%s_%s", failedMachinePrefix, machine.Namespace, machine.Name))
+			continue
+		}
+
 		if machine.Status.NodeRef == nil {
 			klog.V(4).Infof("Status.NodeRef of machine %q is currently nil", machine.Name)
 			continue
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go
index c590738c5572..9fddba979f50 100644
--- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go
+++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go
@@ -450,7 +450,17 @@ func TestControllerFindMachineByProviderID(t *testing.T) {
 		t.Fatalf("expected machines to be equal - expected %+v, got %+v", testConfig.machines[0], machine)
 	}
 
-	// Test #2: Verify machine is not found if it has a
+	// Test #2: Verify machine returned by fake provider ID is correct machine
+	fakeProviderID := fmt.Sprintf("%s%s_%s", failedMachinePrefix, testConfig.machines[0].Namespace, testConfig.machines[0].Name)
+	machine, err = controller.findMachineByProviderID(normalizedProviderID(fakeProviderID))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if !reflect.DeepEqual(machine, testConfig.machines[0]) {
+		t.Fatalf("expected machines to be equal - expected %+v, got %+v", testConfig.machines[0], machine)
+	}
+
+	// Test #3: Verify machine is not found if it has a
 	// non-existent or different provider ID.
 	machine = testConfig.machines[0].DeepCopy()
 	machine.Spec.ProviderID = pointer.StringPtr("does-not-match")
@@ -1126,3 +1136,57 @@ func TestGetAPIGroupPreferredVersion(t *testing.T) {
 		})
 	}
 }
+
+func TestIsFailedMachineProviderID(t *testing.T) {
+	testCases := []struct {
+		name       string
+		providerID normalizedProviderID
+		expected   bool
+	}{
+		{
+			name:       "with the failed machine prefix",
+			providerID: normalizedProviderID(fmt.Sprintf("%sfoo", failedMachinePrefix)),
+			expected:   true,
+		},
+		{
+			name:       "without the failed machine prefix",
+			providerID: normalizedProviderID("foo"),
+			expected:   false,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := isFailedMachineProviderID(tc.providerID); got != tc.expected {
+				t.Errorf("test case: %s, expected: %v, got: %v", tc.name, tc.expected, got)
+			}
+		})
+	}
+}
+
+func TestMachineKeyFromFailedProviderID(t *testing.T) {
+	testCases := []struct {
+		name       string
+		providerID normalizedProviderID
+		expected   string
+	}{
+		{
+			name:       "with a valid failed machine prefix",
+			providerID: normalizedProviderID(fmt.Sprintf("%stest-namespace_foo", failedMachinePrefix)),
+			expected:   "test-namespace/foo",
+		},
+		{
+			name:       "with a machine with an underscore in the name",
+			providerID: normalizedProviderID(fmt.Sprintf("%stest-namespace_foo_bar", failedMachinePrefix)),
+			expected:   "test-namespace/foo_bar",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := machineKeyFromFailedProviderID(tc.providerID); got != tc.expected {
+				t.Errorf("test case: %s, expected: %q, got: %q", tc.name, tc.expected, got)
+			}
+		})
+	}
+}
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go
index 9d883625594b..0b90285280f8 100644
--- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go
+++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go
@@ -115,6 +115,10 @@ func newMachineFromUnstructured(u *unstructured.Unstructured) *Machine {
 		machine.Status.NodeRef = &nodeRef
 	}
 
+	if failureMessage, _, _ := unstructured.NestedString(u.Object, "status", "failureMessage"); failureMessage != "" {
failureMessage != "" { + machine.Status.FailureMessage = pointer.StringPtr(failureMessage) + } + return &machine } @@ -184,5 +188,9 @@ func newUnstructuredFromMachine(m *Machine) *unstructured.Unstructured { } } + if m.Status.FailureMessage != nil { + unstructured.SetNestedField(u.Object, *m.Status.FailureMessage, "status", "failureMessage") + } + return &u } diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go index 51045309bc10..74da6b97f182 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go @@ -943,3 +943,83 @@ func TestNodeGroupDeleteNodesTwice(t *testing.T) { })) }) } + +func TestNodeGroupWithFailedMachine(t *testing.T) { + test := func(t *testing.T, testConfig *testConfig) { + controller, stop := mustCreateTestController(t, testConfig) + defer stop() + + // Simulate a failed machine + machine := testConfig.machines[3].DeepCopy() + machine.Spec.ProviderID = nil + failureMessage := "FailureMessage" + machine.Status.FailureMessage = &failureMessage + if err := controller.machineInformer.Informer().GetStore().Update(newUnstructuredFromMachine(machine)); err != nil { + t.Fatalf("unexpected error updating machine, got %v", err) + } + + nodegroups, err := controller.nodeGroups() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if l := len(nodegroups); l != 1 { + t.Fatalf("expected 1 nodegroup, got %d", l) + } + + ng := nodegroups[0] + nodeNames, err := ng.Nodes() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(nodeNames) != len(testConfig.nodes) { + t.Fatalf("expected len=%v, got len=%v", len(testConfig.nodes), len(nodeNames)) + } + + sort.SliceStable(nodeNames, func(i, j int) bool { + return nodeNames[i].Id < nodeNames[j].Id + }) + + // The failed machine key is sorted to the first index + failedMachineID := fmt.Sprintf("%s%s_%s", failedMachinePrefix, machine.Namespace, machine.Name) + if nodeNames[0].Id != failedMachineID { + t.Fatalf("expected %q, got %q", failedMachineID, nodeNames[0].Id) + } + + for i := 1; i < len(nodeNames); i++ { + // Fix the indexing due the failed machine being removed from the list + var nodeIndex int + if i < 4 { + // for nodes 0, 1, 2 + nodeIndex = i - 1 + } else { + // for nodes 4 onwards + nodeIndex = i + } + + if nodeNames[i].Id != string(normalizedProviderString(testConfig.nodes[nodeIndex].Spec.ProviderID)) { + t.Fatalf("expected %q, got %q", testConfig.nodes[nodeIndex].Spec.ProviderID, nodeNames[i].Id) + } + } + } + + // Note: 10 is an upper bound for the number of nodes/replicas + // Going beyond 10 will break the sorting that happens in the + // test() function because sort.Strings() will not do natural + // sorting and the expected semantics in test() will fail. 
+
+	t.Run("MachineSet", func(t *testing.T) {
+		test(t, createMachineSetTestConfig(testNamespace, 10, map[string]string{
+			nodeGroupMinSizeAnnotationKey: "1",
+			nodeGroupMaxSizeAnnotationKey: "10",
+		}))
+	})
+
+	t.Run("MachineDeployment", func(t *testing.T) {
+		test(t, createMachineDeploymentTestConfig(testNamespace, 10, map[string]string{
+			nodeGroupMinSizeAnnotationKey: "1",
+			nodeGroupMaxSizeAnnotationKey: "10",
+		}))
+	})
+}
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go b/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go
index 4f4e968ac20f..db167b982c9f 100644
--- a/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go
+++ b/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go
@@ -63,6 +63,25 @@ type MachineStatus struct {
 	// NodeRef will point to the corresponding Node if it exists.
 	// +optional
 	NodeRef *corev1.ObjectReference `json:"nodeRef,omitempty"`
+
+	// FailureMessage will be set in the event that there is a terminal problem
+	// reconciling the Machine and will contain a more verbose string suitable
+	// for logging and human consumption.
+	//
+	// This field should not be set for transitive errors that a controller
+	// faces that are expected to be fixed automatically over
+	// time (like service outages), but instead indicate that something is
+	// fundamentally wrong with the Machine's spec or the configuration of
+	// the controller, and that manual intervention is required. Examples
+	// of terminal errors would be invalid combinations of settings in the
+	// spec, values that are unsupported by the controller, or the
+	// responsible controller itself being critically misconfigured.
+	//
+	// Any transient errors that occur during the reconciliation of Machines
+	// can be added as events to the Machine object and/or logged in the
+	// controller's output.
+	// +optional
+	FailureMessage *string `json:"failureMessage,omitempty"`
 }
 
 // MachineList contains a list of Machine
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go b/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go
index 9948f120e894..9b702a86260a 100644
--- a/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go
+++ b/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go
@@ -318,6 +318,11 @@ func (in *MachineSpec) DeepCopy() *MachineSpec {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *MachineStatus) DeepCopyInto(out *MachineStatus) {
 	*out = *in
+	if in.FailureMessage != nil {
+		in, out := &in.FailureMessage, &out.FailureMessage
+		*out = new(string)
+		**out = **in
+	}
 	if in.NodeRef != nil {
 		in, out := &in.NodeRef, &out.NodeRef
 		*out = new(v1.ObjectReference)
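
Reviewer note, outside the patch: a minimal, self-contained sketch of how the fake provider ID introduced by this change round-trips between machineSetProviderIDs and machineKeyFromFailedProviderID. The namespace and machine name below are illustrative only; only the prefix and the underscore separator come from the patch.

package main

import (
	"fmt"
	"strings"
)

// failedMachinePrefix mirrors the constant added in clusterapi_controller.go.
const failedMachinePrefix = "failed-machine-"

func main() {
	// machineSetProviderIDs builds the fake ID for a machine whose
	// Status.FailureMessage is set: prefix + namespace + "_" + name.
	providerID := fmt.Sprintf("%s%s_%s", failedMachinePrefix, "test-namespace", "machine-0")

	// isFailedMachineProviderID / machineKeyFromFailedProviderID then
	// recognise the prefix and recover the "namespace/name" machine key.
	// Only the first underscore is replaced, so underscores in the
	// machine name survive, while namespace names can never contain one.
	if strings.HasPrefix(providerID, failedMachinePrefix) {
		key := strings.Replace(strings.TrimPrefix(providerID, failedMachinePrefix), "_", "/", 1)
		fmt.Println(key) // prints "test-namespace/machine-0"
	}
}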