Skip to content

Commit

Permalink
🌱 handle case where rescue system can't be reached. (#1220)
Browse files Browse the repository at this point in the history
Provide better err msg if rescue system can't be reached. Handle this case better, because currently it reaches and endless loop of hardware reboots if a bare metal server's rescue system is not reachable.

Additionally, the controller emits an Event for every reboot now. This makes it more clear what is happening.
  • Loading branch information
guettli authored May 8, 2024
1 parent 59d4000 commit a561e3c
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 50 deletions.
20 changes: 5 additions & 15 deletions api/v1beta1/hetznerbaremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,13 @@ const (
type RebootType string

const (
// RebootTypeHardware defines the hardware reboot.
RebootTypeHardware RebootType = "hw"
// RebootTypePower defines the power reboot.
// RebootTypePower defines the power reboot. "Press power button of server".
RebootTypePower RebootType = "power"
// RebootTypeSoftware defines the software reboot.
// RebootTypeSoftware defines the software reboot. "Send CTRL+ALT+DEL to the server".
RebootTypeSoftware RebootType = "sw"
// RebootTypeManual defines the manual reboot.
// RebootTypeHardware defines the hardware reboot. "Execute an automatic hardware reset".
RebootTypeHardware RebootType = "hw"
// RebootTypeManual defines the manual reboot. "Order a manual power cycle".
RebootTypeManual RebootType = "man"
)

Expand Down Expand Up @@ -518,16 +518,6 @@ func (host *HetznerBareMetalHost) HasHardwareReboot() bool {
return false
}

// HasPowerReboot returns a boolean indicating whether power reboot exists for the server.
func (host *HetznerBareMetalHost) HasPowerReboot() bool {
for _, rt := range host.Spec.Status.RebootTypes {
if rt == RebootTypePower {
return true
}
}
return false
}

// NeedsProvisioning compares the settings with the provisioning
// status and returns true when more work is needed or false
// otherwise.
Expand Down
27 changes: 0 additions & 27 deletions api/v1beta1/hetznerbaremetalhost_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,33 +232,6 @@ var _ = Describe("Test HasHardwareReboot", func() {
)
})

var _ = Describe("Test HasPowerReboot", func() {
type testCaseHasPowerReboot struct {
rebootTypes []RebootType
expectBool bool
}

DescribeTable("Test HasPowerReboot",
func(tc testCaseHasPowerReboot) {
host := HetznerBareMetalHost{}
host.Spec.Status.RebootTypes = tc.rebootTypes
Expect(host.HasPowerReboot()).Should(Equal(tc.expectBool))
},
Entry("has power reboot - single reboot type", testCaseHasPowerReboot{
rebootTypes: []RebootType{RebootTypePower},
expectBool: true,
}),
Entry("has power reboot - multiple reboot types", testCaseHasPowerReboot{
rebootTypes: []RebootType{RebootTypeSoftware, RebootTypePower},
expectBool: true,
}),
Entry("has no power reboot", testCaseHasPowerReboot{
rebootTypes: []RebootType{RebootTypeSoftware, RebootTypeManual},
expectBool: false,
}),
)
})

var _ = Describe("Test NeedsProvisioning", func() {
type testCaseNeedsProvisioning struct {
installImage *InstallImage
Expand Down
1 change: 1 addition & 0 deletions hack/filter-caph-controller-manager-logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
'Update to resource only changes insignificant fields',
'"approved csr"',
'"Registering webhook"',
"'statusCode': 200, 'method': 'GET', 'url': 'https://robot-ws.your-server.de",
]

def main():
Expand Down
55 changes: 47 additions & 8 deletions pkg/services/baremetal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,11 @@ func (s *Service) actionPreparing() actionResult {
if err := handleSSHError(sshClient.Reboot()); err != nil {
return actionError{err: fmt.Errorf("failed to reboot server via ssh: %w", err)}
}

msg := fmt.Sprintf("Phase %s: Rebooting via ssh into rescue mode.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeSSHRebootTriggered, "ssh reboot triggered")
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeSSHRebootTriggered, msg)
return actionComplete{}
}

Expand All @@ -239,8 +241,11 @@ func (s *Service) actionPreparing() actionResult {
return actionError{err: fmt.Errorf(errMsgFailedReboot, err)}
}

msg := fmt.Sprintf("Phase %s: Reboot into rescue system via rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootType)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(errorType, "software/hardware reboot triggered")
s.scope.HetznerBareMetalHost.SetError(errorType, msg)
return actionComplete{}
}

Expand Down Expand Up @@ -332,6 +337,9 @@ func (s *Service) handleIncompleteBoot(isRebootIntoRescue, isTimeout, isConnecti
// if error has occurred before, check the timeout
if hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, time.Minute) {
msg := "Connection error when targeting server with ssh that might be due to a wrong ssh port. Please check."
if isRebootIntoRescue {
msg = "Connection error. Can't reach rescue system via ssh."
}
conditions.MarkFalse(
s.scope.HetznerBareMetalHost,
infrav1.ProvisionSucceededCondition,
Expand Down Expand Up @@ -389,6 +397,10 @@ func (s *Service) handleErrorTypeSSHRebootFailed(isSSHTimeoutError, wantsRescue
// right hostname. This means that the server has not been rebooted and we need to escalate.
// If we got a timeout error from ssh, it means that the server has not yet finished rebooting.
// If the timeout for ssh reboots has been reached, then escalate.
rebootInto := "node"
if wantsRescue {
rebootInto = "rescue mode"
}
if !isSSHTimeoutError || hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, sshResetTimeout) {
if wantsRescue {
// make sure hat we boot into rescue mode if that is necessary
Expand All @@ -404,9 +416,11 @@ func (s *Service) handleErrorTypeSSHRebootFailed(isSSHTimeoutError, wantsRescue
s.handleRobotRateLimitExceeded(err, rebootServerStr)
return fmt.Errorf(errMsgFailedReboot, err)
}

msg := fmt.Sprintf("Phase %s: Reboot via ssh into %s failed. Now using rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootInto, rebootType)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(errorType, "ssh reboot timed out")
s.scope.HetznerBareMetalHost.SetError(errorType, msg)
}
return nil
}
Expand All @@ -429,6 +443,10 @@ func rebootAndErrorTypeAfterTimeout(host *infrav1.HetznerBareMetalHost) (infrav1
}

func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRescue bool) error {
rebootInto := "node"
if wantsRescue {
rebootInto = "rescue mode"
}
// If it is not a timeout error, then the ssh command (get hostname) worked, but didn't give us the
// right hostname. This means that the server has not been rebooted and we need to escalate.
// If we got a timeout error from ssh, it means that the server has not yet finished rebooting.
Expand All @@ -445,15 +463,21 @@ func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRe
s.handleRobotRateLimitExceeded(err, rebootServerStr)
return fmt.Errorf(errMsgFailedReboot, err)
}

msg := fmt.Sprintf("Phase %s: Reboot via type 'software' into %s failed. Now using rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootInto, infrav1.RebootTypeHardware)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeHardwareRebootTriggered, "software reboot timed out")
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeHardwareRebootTriggered, msg)
}

return nil
}

func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRescue bool) error {
rebootInto := "node"
if wantsRescue {
rebootInto = "rescue mode"
}
// If it is not a timeout error, then the ssh command (get hostname) worked, but didn't give us the
// right hostname. This means that the server has not been rebooted and we need to escalate.
// If we got a timeout error from ssh, it means that the server has not yet finished rebooting.
Expand All @@ -475,6 +499,9 @@ func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRe
s.handleRobotRateLimitExceeded(err, rebootServerStr)
return fmt.Errorf(errMsgFailedReboot, err)
}
msg := fmt.Sprintf("Phase %s: Reboot via ssh into %s failed. Now using rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootInto, infrav1.RebootTypeHardware)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
}
return nil
}
Expand Down Expand Up @@ -537,6 +564,12 @@ func (s *Service) actionRegistering() actionResult {
if err != nil {
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
}
timeSinceReboot := "unknown"
if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil {
timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String()
}
s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "stdout", out.StdOut, "stderr", out.StdErr, "err", out.Err.Error(),
"isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot)
return actionContinue{delay: 10 * time.Second}
}

Expand Down Expand Up @@ -708,7 +741,7 @@ func (s *Service) analyzeSSHErrorRegistering(sshErr error) (isSSHTimeoutError, i
}
reterr = fmt.Errorf("wrong ssh key: %w", sshErr)
case sshclient.IsConnectionRefusedError(sshErr):
if !rebootTriggered {
if !rebootTriggered && s.scope.HetznerBareMetalHost.Spec.Status.ErrorType != infrav1.ErrorTypeHardwareRebootTriggered {
// Reboot did not trigger
return false, false, nil
}
Expand Down Expand Up @@ -1115,6 +1148,9 @@ func (s *Service) actionImageInstalling() actionResult {
record.Warn(s.scope.HetznerBareMetalHost, "RebootFailed", err.Error())
return actionError{err: fmt.Errorf("failed to reboot server: %w", err)}
}
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", "Phase %s: reboot via ssh into node (after machine image was installed)",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState)

s.scope.Logger.Info("RebootAfterInstallimageSucceeded", "stdout", out.StdOut, "stderr", out.StdErr)

// clear potential errors - all done
Expand Down Expand Up @@ -1660,6 +1696,9 @@ func (s *Service) actionProvisioned() actionResult {
if err := handleSSHError(out); err != nil {
return actionError{err: err}
}

record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", "Phase %s: rebooting via type ssh (because reboot annotation was set).",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState)
s.scope.HetznerBareMetalHost.Spec.Status.Rebooted = true
return actionContinue{delay: 10 * time.Second}
}
Expand Down

0 comments on commit a561e3c

Please sign in to comment.