diff --git a/docs/development/cp_support_new.md b/docs/development/cp_support_new.md index 99ad55949..2843b42af 100644 --- a/docs/development/cp_support_new.md +++ b/docs/development/cp_support_new.md @@ -66,7 +66,7 @@ The contract between the Machine Controller Manager (MCM) and the Machine Contro 1. Fill in the methods described at `pkg/provider/core.go` to manage VMs on your cloud provider. Comments are provided above each method to help you fill them up with desired `REQUEST` and `RESPONSE` parameters. - A sample provider implementation for these methods can be found [here](https://github.com/gardener/machine-controller-manager-provider-aws/blob/master/pkg/aws/core.go). - Fill in the required methods `CreateMachine()`, and `DeleteMachine()` methods. - - Optionally fill in methods like `GetMachineStatus()`, `ListMachines()`, and `GetVolumeIDs()`. You may choose to fill these once the working of the required methods seems to be working. + - Optionally fill in methods like `GetMachineStatus()`, `InitializeMachine`, `ListMachines()`, and `GetVolumeIDs()`. You may choose to fill these once the working of the required methods seems to be working. - `GetVolumeIDs()` expects VolumeIDs to be decoded from the volumeSpec based on the cloud provider. - There is also an OPTIONAL method `GenerateMachineClassForMigration()` that helps in migration of `{ProviderSpecific}MachineClass` to `MachineClass` CR (custom resource). This only makes sense if you have an existing implementation (in-tree) acting on different CRD types. You would like to migrate this. If not, you MUST return an error (machine error UNIMPLEMENTED) to avoid processing this step. 1. Perform validation of APIs that you have described and make it a part of your methods as required at each request. diff --git a/docs/development/machine_error_codes.md b/docs/development/machine_error_codes.md index 5860aceef..13f148065 100644 --- a/docs/development/machine_error_codes.md +++ b/docs/development/machine_error_codes.md @@ -145,6 +145,64 @@ The MCM MUST implement the specified error recovery behavior when it encounters The status `message` MUST contain a human readable description of error, if the status `code` is not `OK`. This string MAY be surfaced by MCM to end users. +#### `InitializeMachine` + +Provider can OPTIONALLY implement this driver call. Else should return a `UNIMPLEMENTED` status in error. +This interface method will be called by the MCM to initialize a new VM just after creation. This can be used to configure network configuration etc. + +- This call requests the provider to initialize a newly created VM backing the machine-object. +- The `InitializeMachineResponse` returned by this method is expected to return + - `ProviderID` that uniquely identifys the VM at the provider. This is expected to match with the `node.Spec.ProviderID` on the node object. + - `NodeName` that is the expected name of the machine when it joins the cluster. It must match with the node name. + +```protobuf +// InitializeMachine call is responsible for VM initialization on the provider. +InitializeMachine(context.Context, *InitializeMachineRequest) (*InitializeMachineResponse, error) + +// InitializeMachineRequest encapsulates params for the VM Initialization operation (Driver.InitializeMachine). +type InitializeMachineRequest struct { + // Machine object representing VM that must be initialized + Machine *v1alpha1.Machine + + // MachineClass backing the machine object + MachineClass *v1alpha1.MachineClass + + // Secret backing the machineClass object + Secret *corev1.Secret +} + +// InitializeMachineResponse is the response for VM instance initialization (Driver.InitializeMachine). +type InitializeMachineResponse struct { + // ProviderID is the unique identification of the VM at the cloud provider. + // ProviderID typically matches with the node.Spec.ProviderID on the node object. + // Eg: gce://project-name/region/vm-ID + ProviderID string + + // NodeName is the name of the node-object registered to kubernetes. + NodeName string +} + +``` + +##### InitializeMachine Errors + +If the provider is unable to complete the `InitializeMachine` call successfully, it MUST return a non-ok machine code in the machine status. + +If the conditions defined below are encountered, the provider MUST return the specified machine error code. +The MCM MUST implement the specified error recovery behavior when it encounters the machine error code. + +| machine Code | Condition | Description | Recovery Behavior | Auto Retry Required | +|-----------|-----------|-------------|-------------------|------------| +| 0 OK | Successful | The call was successful in initializing a VM that matches supplied initialization request. The `InitializeMachineResponse` is returned with desired values | | N | +| 5 NOT_FOUND | Timeout | VM Instance for Machine isn't found at provider | Skip Initialization and Continue | N | +| 12 UNIMPLEMENTED | Not implemented | Unimplemented indicates operation is not implemented or not supported/enabled in this service. | Skip Initialization and continue | N | +| 13 INTERNAL | Major error | Means some invariants expected by underlying system has been broken. | Needs investigation and possible intervention to fix this | Y | +| 17 UNINITIALIZED | Failed Initialization| VM Instance could not be initializaed | Initialization is reattempted in next reconcile cycle | Y | + +The status `message` MUST contain a human readable description of error, if the status `code` is not `OK`. +This string MAY be surfaced by MCM to end users. + + #### `DeleteMachine` A Provider is REQUIRED to implement this driver call. @@ -267,6 +325,7 @@ If the conditions defined below are encountered, the provider MUST return the sp | 13 INTERNAL | Major error | Means some invariants expected by underlying system has been broken. If you see one of these errors, something is very broken. | Needs manual intervension to fix this | N | | 14 UNAVAILABLE | Not Available | Unavailable indicates the service is currently unavailable. | Retry operation after sometime | Y | | 16 UNAUTHENTICATED | Missing provider credentials | Request does not have valid authentication credentials for the operation | Fix the provider credentials | N | +| 17 UNINITIALIZED | Failed Initialization| VM Instance could not be initializaed | Initialization is reattempted in next reconcile cycle | N | The status `message` MUST contain a human readable description of error, if the status `code` is not `OK`. This string MAY be surfaced by MCM to end users. diff --git a/pkg/util/provider/driver/driver.go b/pkg/util/provider/driver/driver.go index 08e6bc286..0d730c735 100644 --- a/pkg/util/provider/driver/driver.go +++ b/pkg/util/provider/driver/driver.go @@ -20,14 +20,23 @@ package driver import ( "context" - "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" corev1 "k8s.io/api/core/v1" + + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" ) // Driver is the common interface for creation/deletion of the VMs over different cloud-providers. type Driver interface { // CreateMachine call is responsible for VM creation on the provider CreateMachine(context.Context, *CreateMachineRequest) (*CreateMachineResponse, error) + // InitializeMachine call is responsible for VM initialization on the provider. + // This method should only be invoked as a post VM creation initialization to configure network configuration etc. + // + // In case of an error, this operation should return an error with one of the following status codes + // - codes.Unimplemented if the provider does not support VM instance initialization. + // - codes.Uninitialized initialization of VM instance failed due to errors + // - codes.NotFound if VM instance was not found. + InitializeMachine(context.Context, *InitializeMachineRequest) (*InitializeMachineResponse, error) // DeleteMachine call is responsible for VM deletion/termination on the provider DeleteMachine(context.Context, *DeleteMachineRequest) (*DeleteMachineResponse, error) // GetMachineStatus call get's the status of the VM backing the machine object on the provider @@ -64,6 +73,29 @@ type CreateMachineResponse struct { LastKnownState string } +// InitializeMachineRequest encapsulates params for the VM Initialization operation (Driver.InitializeMachine). +type InitializeMachineRequest struct { + // Machine object representing VM that must be initialized + Machine *v1alpha1.Machine + + // MachineClass backing the machine object + MachineClass *v1alpha1.MachineClass + + // Secret backing the machineClass object + Secret *corev1.Secret +} + +// InitializeMachineResponse is the response for VM instance initialization (Driver.InitializeMachine). +type InitializeMachineResponse struct { + // ProviderID is the unique identification of the VM at the cloud provider. + // ProviderID typically matches with the node.Spec.ProviderID on the node object. + // Eg: gce://project-name/region/vm-ID + ProviderID string + + // NodeName is the name of the node-object registered to kubernetes. + NodeName string +} + // DeleteMachineRequest is the delete request for VM deletion type DeleteMachineRequest struct { // Machine object from whom VM is to be deleted diff --git a/pkg/util/provider/driver/fake.go b/pkg/util/provider/driver/fake.go index 83b123434..14bd0941b 100644 --- a/pkg/util/provider/driver/fake.go +++ b/pkg/util/provider/driver/fake.go @@ -74,6 +74,27 @@ func (d *FakeDriver) CreateMachine(ctx context.Context, createMachineRequest *Cr return nil, d.Err } +// InitializeMachine makes a call to the driver to initialize the VM instance of machine. +func (d *FakeDriver) InitializeMachine(ctx context.Context, initMachineRequest *InitializeMachineRequest) (*InitializeMachineResponse, error) { + sErr, ok := status.FromError(d.Err) + if ok && sErr != nil { + switch sErr.Code() { + case codes.NotFound: + d.VMExists = false + return nil, d.Err + case codes.Unimplemented: + break + default: + return nil, d.Err + } + } + d.VMExists = true + return &InitializeMachineResponse{ + ProviderID: d.ProviderID, + NodeName: d.NodeName, + }, d.Err +} + // DeleteMachine make a call to the driver to delete the machine. func (d *FakeDriver) DeleteMachine(ctx context.Context, deleteMachineRequest *DeleteMachineRequest) (*DeleteMachineResponse, error) { d.VMExists = false @@ -92,7 +113,6 @@ func (d *FakeDriver) GetMachineStatus(ctx context.Context, getMachineStatusReque case d.Err != nil: return nil, d.Err } - return &GetMachineStatusResponse{ ProviderID: d.ProviderID, NodeName: d.NodeName, diff --git a/pkg/util/provider/machinecodes/codes/code_string.go b/pkg/util/provider/machinecodes/codes/code_string.go index 3feb6cea9..ab6892332 100644 --- a/pkg/util/provider/machinecodes/codes/code_string.go +++ b/pkg/util/provider/machinecodes/codes/code_string.go @@ -62,6 +62,8 @@ func (c Code) String() string { return "DataLoss" case Unauthenticated: return "Unauthenticated" + case Uninitialized: + return "Uninitialized" default: return "Code(" + strconv.FormatInt(int64(c), 10) + ")" } diff --git a/pkg/util/provider/machinecodes/codes/codes.go b/pkg/util/provider/machinecodes/codes/codes.go index ad9e979d1..fe90de687 100644 --- a/pkg/util/provider/machinecodes/codes/codes.go +++ b/pkg/util/provider/machinecodes/codes/codes.go @@ -144,6 +144,11 @@ const ( // Unauthenticated indicates the request does not have valid // authentication credentials for the operation. Unauthenticated Code = 16 + + // Uninitialized indicates that the VM instance initialization was not performed. + // This is meant to be used by providers in implementation of + // [github.com/gardener/machine-controller-manager/pkg/util/provider/driver.Driver.GetMachineStatus] + Uninitialized Code = 17 ) var strToCode = map[string]Code{ @@ -164,6 +169,7 @@ var strToCode = map[string]Code{ "Unavailable": Unavailable, "DataLoss": DataLoss, "Unauthenticated": Unauthenticated, + "Uninitialized": Uninitialized, } // StringToCode coverts string into the Code. diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 6a8929eff..e2dffab58 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -360,8 +360,9 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque nodeName, providerID string // Initializations - machine = createMachineRequest.Machine - machineName = createMachineRequest.Machine.Name + machine = createMachineRequest.Machine + machineName = createMachineRequest.Machine.Name + uninitializedMachine = false ) // we should avoid mutating Secret, since it goes all the way into the Informer's store @@ -390,6 +391,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque klog.Errorf("Error occurred while decoding machine error for machine %q: %s", machine.Name, err) return machineutils.MediumRetry, err } + klog.Warningf("For machine %q, obtained VM error status as: %s", machineErr) // Decoding machine error code switch machineErr.Code() { @@ -403,7 +405,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque klog.V(2).Infof("The machine creation is triggered with timeout of %s", c.getEffectiveCreationTimeout(createMachineRequest.Machine).Duration) createMachineResponse, err := c.driver.CreateMachine(ctx, createMachineRequest) if err != nil { - // Create call returned an error. + // Create call returned an error klog.Errorf("Error while creating machine %s: %s", machine.Name, err.Error()) return c.machineCreateErrorHandler(ctx, machine, createMachineResponse, err) } @@ -467,6 +469,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque klog.V(2).Infof("Machine %q marked Failed as VM was referring to a stale node object", machine.Name) return machineutils.ShortRetry, err } + uninitializedMachine = true } else { // if node label present that means there must be a backing VM ,without need of GetMachineStatus() call nodeName = machine.Labels[v1alpha1.NodeLabelKey] @@ -496,6 +499,10 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque return machineutils.ShortRetry, err + case codes.Uninitialized: + uninitializedMachine = true + klog.Infof("VM instance associated with machine %s was created but not initialized.", machine.Name) + default: updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, @@ -526,6 +533,13 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque providerID = getMachineStatusResponse.ProviderID } + if uninitializedMachine { + retryPeriod, err := c.initializeMachine(ctx, createMachineRequest.Machine, createMachineRequest.MachineClass, createMachineRequest.Secret) + if err != nil { + return retryPeriod, err + } + } + _, machineNodeLabelPresent := createMachineRequest.Machine.Labels[v1alpha1.NodeLabelKey] _, machinePriorityAnnotationPresent := createMachineRequest.Machine.Annotations[machineutils.MachinePriority] @@ -585,6 +599,57 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque return machineutils.LongRetry, nil } +func (c *controller) initializeMachine(ctx context.Context, machine *v1alpha1.Machine, machineClass *v1alpha1.MachineClass, secret *corev1.Secret) (machineutils.RetryPeriod, error) { + req := &driver.InitializeMachineRequest{ + Machine: machine, + MachineClass: machineClass, + Secret: secret, + } + klog.V(3).Infof("Initializing VM instance for Machine %q", machine.Name) + resp, err := c.driver.InitializeMachine(ctx, req) + if err != nil { + errStatus, ok := status.FromError(err) + if !ok { + klog.Errorf("Cannot decode Driver error for machine %q: %s. Unexpected behaviour as Driver errors are expected to be of type status.Status", machine.Name, err) + return machineutils.LongRetry, err + } + klog.Errorf("Error occurred while initializing VM instance for machine %q: %s", machine.Name, err) + switch errStatus.Code() { + case codes.Unimplemented: + klog.V(3).Infof("Provider does not support Driver.InitializeMachine - skipping VM instance initialization for %q.", machine.Name) + return 0, nil + case codes.NotFound: + klog.Warningf("No VM instance found for machine %q. Skipping VM instance initialization.", machine.Name) + return 0, nil + } + updateRetryPeriod, updateErr := c.machineStatusUpdate( + ctx, + machine, + v1alpha1.LastOperation{ + Description: fmt.Sprintf("Provider error: %s. %s", err.Error(), machineutils.InstanceInitialization), + ErrorCode: errStatus.Code().String(), + State: v1alpha1.MachineStateFailed, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + }, + v1alpha1.CurrentStatus{ + Phase: c.getCreateFailurePhase(machine), + LastUpdateTime: metav1.Now(), + }, + machine.Status.LastKnownState, + ) + + if updateErr != nil { + return updateRetryPeriod, updateErr + } + + return machineutils.ShortRetry, err + } + + klog.V(3).Infof("VM instance %q for machine %q was initialized", resp.ProviderID, machine.Name) + return 0, nil +} + func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( machine = deleteMachineRequest.Machine diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 9bfd4bf81..49f448865 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -508,6 +508,9 @@ var _ = Describe("machine", func() { } else { Expect(actual.Status.LastOperation.ErrorCode).To(Equal("")) } + if data.expect.machine.Status.LastOperation.Description != "" { + Expect(actual.Status.LastOperation.Description).To(Equal(data.expect.machine.Status.LastOperation.Description)) + } }, Entry("Machine creation succeeds with object UPDATE", &data{ @@ -1029,6 +1032,70 @@ var _ = Describe("machine", func() { retry: machineutils.ShortRetry, }, }), + Entry("Machine initialization failed due to VM instance initialization error", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + Data: map[string][]byte{"userData": []byte("test")}, + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines(1, &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + }, + }, nil, nil, nil, nil, true, metav1.Now()), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "fakeNode-0", + }, + }, + }, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: status.Error(codes.Uninitialized, "VM instance could not be initialized"), + }, + }, + expect: expect{ + machine: newMachine(&v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machineClass", + }, + }, + }, &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineCrashLoopBackOff, + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Provider error: %s. %s", status.Error(codes.Uninitialized, "VM instance could not be initialized").Error(), machineutils.InstanceInitialization), + ErrorCode: codes.Uninitialized.String(), + State: v1alpha1.MachineStateFailed, + Type: v1alpha1.MachineOperationCreate, + }, + }, nil, nil, nil, true, metav1.Now()), + err: status.Error(codes.Uninitialized, "VM instance could not be initialized"), + retry: machineutils.ShortRetry, + }, + }), /* Entry("Machine creation success even on temporary APIServer disruption", &data{ setup: setup{ diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index 3d91e1a68..8b2f679c7 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -27,6 +27,9 @@ const ( // GetVMStatus sets machine status to terminating and specifies next step as getting VMs GetVMStatus = "Set machine status to termination. Now, getting VM Status" + // InstanceInitialization is a step that represents initialization of a VM instance (post-creation). + InstanceInitialization = "Initialize VM Instance" + // InitiateDrain specifies next step as initiate node drain InitiateDrain = "Initiate node drain"