Skip to content

Commit

Permalink
Merge pull request #2041 from iurygregory/hup
Browse files Browse the repository at this point in the history
✨ Add reference to HostUpdatePolicy in Servicing with HFS Support
  • Loading branch information
metal3-io-bot authored Nov 8, 2024
2 parents 08401af + 74bb269 commit af93711
Show file tree
Hide file tree
Showing 14 changed files with 663 additions and 51 deletions.
11 changes: 9 additions & 2 deletions apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ const (
// OperationalStatusDetached is the status value when the host is
// marked unmanaged via the detached annotation.
OperationalStatusDetached OperationalStatus = "detached"

// OperationalStatusServicing is the status value when the host is
// undergoing servicing (e.g. checking firmware settings).
OperationalStatusServicing OperationalStatus = "servicing"
)

// OperationalStatusAllowed represents the allowed values of OperationalStatus.
Expand Down Expand Up @@ -179,6 +183,9 @@ const (
// DetachError is an error condition occurring when the
// controller is unable to detatch the host from the provisioner.
DetachError ErrorType = "detach error"
// ServicingError is an error condition occurring when
// service steps failed.
ServicingError ErrorType = "servicing error"
)

// ErrorTypeAllowed represents the allowed values of ErrorType.
Expand Down Expand Up @@ -800,12 +807,12 @@ type BareMetalHostStatus struct {
// after modifying this file

// OperationalStatus holds the status of the host
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached;servicing
OperationalStatus OperationalStatus `json:"operationalStatus"`

// ErrorType indicates the type of failure encountered when the
// OperationalStatus is OperationalStatusError
// +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error
// +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error;servicing error
ErrorType ErrorType `json:"errorType,omitempty"`

// LastUpdated identifies when this status was last observed.
Expand Down
2 changes: 2 additions & 0 deletions config/base/crds/bases/metal3.io_baremetalhosts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,7 @@ spec:
- preparation error
- provisioning error
- power management error
- servicing error
type: string
goodCredentials:
description: The last credentials we were able to validate as working.
Expand Down Expand Up @@ -897,6 +898,7 @@ spec:
- error
- delayed
- detached
- servicing
type: string
poweredOn:
description: |-
Expand Down
2 changes: 2 additions & 0 deletions config/render/capm3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,7 @@ spec:
- preparation error
- provisioning error
- power management error
- servicing error
type: string
goodCredentials:
description: The last credentials we were able to validate as working.
Expand Down Expand Up @@ -897,6 +898,7 @@ spec:
- error
- delayed
- detached
- servicing
type: string
poweredOn:
description: |-
Expand Down
140 changes: 124 additions & 16 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3api.ErrorType, err
metal3api.InspectionError: "InspectionError",
metal3api.ProvisioningError: "ProvisioningError",
metal3api.PowerManagementError: "PowerManagementError",
metal3api.PreparationError: "PreparationError",
metal3api.ServicingError: "ServicingError",
}[errorType]

counter := actionFailureCounters.WithLabelValues(eventType)
Expand Down Expand Up @@ -464,9 +466,9 @@ func hasInspectAnnotation(host *metal3api.BareMetalHost) bool {
return false
}

// clearError removes any existing error message.
func clearError(host *metal3api.BareMetalHost) (dirty bool) {
dirty = host.SetOperationalStatus(metal3api.OperationalStatusOK)
// clearErrorWithStatus removes any existing error message and sets operational status.
func clearErrorWithStatus(host *metal3api.BareMetalHost, status metal3api.OperationalStatus) (dirty bool) {
dirty = host.SetOperationalStatus(status)
var emptyErrType metal3api.ErrorType
if host.Status.ErrorType != emptyErrType {
host.Status.ErrorType = emptyErrType
Expand All @@ -479,6 +481,11 @@ func clearError(host *metal3api.BareMetalHost) (dirty bool) {
return dirty
}

// clearError removes any existing error message.
func clearError(host *metal3api.BareMetalHost) (dirty bool) {
return clearErrorWithStatus(host, metal3api.OperationalStatusOK)
}

// setErrorMessage updates the ErrorMessage in the host Status struct
// and increases the ErrorCount.
func setErrorMessage(host *metal3api.BareMetalHost, errType metal3api.ErrorType, message string) {
Expand Down Expand Up @@ -886,7 +893,6 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf

// Create the hostFirmwareSettings resource with same host name/namespace if it doesn't exist
// Create the hostFirmwareComponents resource with same host name/namespace if it doesn't exist
// Set owner reference on hostUpdatePolicy resource if not set
if info.host.Name != "" {
if !info.host.DeletionTimestamp.IsZero() {
info.log.Info(fmt.Sprintf("will not attempt to create new hostFirmwareSettings and hostFirmwareComponents in %s", info.host.Namespace))
Expand All @@ -899,7 +905,7 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf
info.log.Info("failed creating hostfirmwarecomponents")
return actionError{errors.Wrap(err, "failed creating hostFirmwareComponents")}
}
if err = r.hostUpdatePolicySetOwnerReference(info); err != nil {
if _, err = r.acquireHostUpdatePolicy(info); err != nil {
info.log.Info("failed setting owner reference on hostupdatepolicy")
return actionError{errors.Wrap(err, "failed setting owner reference on hostUpdatePolicy")}
}
Expand Down Expand Up @@ -1359,6 +1365,90 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio
return actionComplete{}
}

func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner, info *reconcileInfo, hup *metal3api.HostUpdatePolicy) (result actionResult) {
servicingData := provisioner.ServicingData{}

// (NOTE)janders: since Servicing is an opt-in feature that requires HostUpdatePolicy to be created and set to onReboot
// set below booleans to false by default and change to true based on policy settings

var fwDirty bool
var hfsDirty bool
var liveFirmwareSettingsAllowed bool

if hup != nil {
liveFirmwareSettingsAllowed = (hup.Spec.FirmwareSettings == metal3api.HostUpdatePolicyOnReboot)
}

if liveFirmwareSettingsAllowed {
// handling pre-HFS FirmwareSettings here
if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) {
servicingData.FirmwareConfig = info.host.Spec.Firmware
fwDirty = true
}
// handling HFS based FirmwareSettings here
var hfs *metal3api.HostFirmwareSettings
var err error
hfsDirty, hfs, err = r.getHostFirmwareSettings(info)
if err != nil {
return actionError{fmt.Errorf("could not determine updated settings: %w", err)}
}
if hfsDirty {
servicingData.ActualFirmwareSettings = hfs.Status.Settings
servicingData.TargetFirmwareSettings = hfs.Spec.Settings
}
}

hasChanges := fwDirty || hfsDirty

// Even if settings are clean, we need to check the result of the current servicing.
if !hasChanges && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError {
// If nothing is going on, return control to the power management.
return nil
}

// FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually
// succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the
// update didn't actually happen. This is deemed an acceptable risk for the moment since it is only
// going to impact a small subset of Firmware Settings implementations.
if info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing {
info.host.Status.OperationalStatus = metal3api.OperationalStatusServicing
return actionUpdate{}
}

provResult, started, err := prov.Service(servicingData, hasChanges,
info.host.Status.ErrorType == metal3api.ServicingError)
if err != nil {
return actionError{fmt.Errorf("error servicing host: %w", err)}
}
if provResult.ErrorMessage != "" {
info.host.Status.Provisioning.Firmware = nil
result = recordActionFailure(info, metal3api.ServicingError, provResult.ErrorMessage)
return result
}

dirty := clearErrorWithStatus(info.host, metal3api.OperationalStatusServicing)

if started && fwDirty {
info.host.Status.Provisioning.Firmware = info.host.Spec.Firmware.DeepCopy()
dirty = true
}

resultAction := actionContinue{delay: provResult.RequeueAfter}
if dirty {
return actionUpdate{resultAction}
} else if provResult.Dirty {
return resultAction
}

// Servicing is finished at this point, clean up operational status
if clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) {
// FIXME(janders/dtantsur): this can be racy. We should consider
// using a generation number to decide if we start servicing or not.
return actionUpdate{actionContinue{delay: subResourceNotReadyRetryDelay}}
}
return nil
}

// Check the current power status against the desired power status.
func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
var provResult provisioner.Result
Expand All @@ -1372,12 +1462,20 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn {
info.log.Info("updating power status", "discovered", *hwState.PoweredOn)
info.host.Status.PoweredOn = *hwState.PoweredOn
clearError(info.host)
if info.host.Status.OperationalStatus == metal3api.OperationalStatusError && info.host.Status.ErrorType == metal3api.PowerManagementError {
clearError(info.host)
}
return actionUpdate{}
}

desiredPowerOnState := info.host.Spec.Online

provState := info.host.Status.Provisioning.State
// Normal reboots only work in provisioned states, changing online is also possible for available hosts.
isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned
// FIXME(janders/dtantsur) it would be preferrable to pass in state as an argument
// however this falls outside the scope of this specific change.

if !info.host.Status.PoweredOn {
if _, suffixlessAnnotationExists := info.host.Annotations[metal3api.RebootAnnotationPrefix]; suffixlessAnnotationExists {
delete(info.host.Annotations, metal3api.RebootAnnotationPrefix)
Expand All @@ -1390,9 +1488,19 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
}
}

provState := info.host.Status.Provisioning.State
// Normal reboots only work in provisioned states, changing online is also possible for available hosts.
isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned
servicingAllowed := isProvisioned && !info.host.Status.PoweredOn && desiredPowerOnState
if servicingAllowed || info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing || info.host.Status.ErrorType == metal3api.ServicingError {
hup, err := r.acquireHostUpdatePolicy(info)
if err != nil {
info.log.Info("failed setting owner reference on hostupdatepolicy")
return actionError{errors.Wrap(err, "failed setting owner reference on hostUpdatePolicy")}
}

result := r.doServiceIfNeeded(prov, info, hup)
if result != nil {
return result
}
}

desiredReboot, desiredRebootMode := hasRebootAnnotation(info, !isProvisioned)

Expand Down Expand Up @@ -1866,7 +1974,7 @@ func (r *BareMetalHostReconciler) createHostFirmwareSettings(info *reconcileInfo
return nil
}

func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconcileInfo) error {
func (r *BareMetalHostReconciler) acquireHostUpdatePolicy(info *reconcileInfo) (policy *metal3api.HostUpdatePolicy, err error) {
// NOTE(janders) the goal here is to ensure that the controller reads the hup resource and adds OwnerReference to it
hup := &metal3api.HostUpdatePolicy{}
if err := r.Get(info.ctx, info.request.NamespacedName, hup); err != nil {
Expand All @@ -1876,23 +1984,23 @@ func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconc
// garbage collected. For additional cleanup logic use
// finalizers. Return and don't requeue

return nil
return nil, nil
}
// Error reading the object
return fmt.Errorf("could not load hostUpdatePolicy resource due to %w", err)
return nil, fmt.Errorf("could not load hostUpdatePolicy resource due to %w", err)
}
if !ownerReferenceExists(info.host, hup) {
if err := controllerutil.SetOwnerReference(info.host, hup, r.Scheme()); err != nil {
return fmt.Errorf("could not set bmh as owner for hostUpdatePolicy due to %w", err)
return hup, fmt.Errorf("could not set bmh as owner for hostUpdatePolicy due to %w", err)
}
if err := r.Update(info.ctx, hup); err != nil {
return fmt.Errorf("failure updating hostUpdatePolicy resource due to %w", err)
return hup, fmt.Errorf("failure updating hostUpdatePolicy resource due to %w", err)
}

return nil
return hup, nil
}

return nil
return hup, nil
}

// Get the stored firmware settings if there are valid changes.
Expand Down
Loading

0 comments on commit af93711

Please sign in to comment.