Skip to content

Commit

Permalink
🌱 provider better err msg if rescue system can't be reached.
Browse files Browse the repository at this point in the history
  • Loading branch information
guettli committed Apr 25, 2024
1 parent 21601ab commit 3aa8cd2
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 51 deletions.
20 changes: 5 additions & 15 deletions api/v1beta1/hetznerbaremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,13 @@ const (
type RebootType string

const (
// RebootTypeHardware defines the hardware reboot.
RebootTypeHardware RebootType = "hw"
// RebootTypePower defines the power reboot.
// RebootTypePower defines the power reboot. "Press power button of server".
RebootTypePower RebootType = "power"
// RebootTypeSoftware defines the software reboot.
// RebootTypeSoftware defines the software reboot. "Send CTRL+ALT+DEL to the server".
RebootTypeSoftware RebootType = "sw"
// RebootTypeManual defines the manual reboot.
// RebootTypeHardware defines the hardware reboot. "Execute an automatic hardware reset".
RebootTypeHardware RebootType = "hw"
// RebootTypeManual defines the manual reboot. "Order a manual power cycle".
RebootTypeManual RebootType = "man"
)

Expand Down Expand Up @@ -518,16 +518,6 @@ func (host *HetznerBareMetalHost) HasHardwareReboot() bool {
return false
}

// HasPowerReboot returns a boolean indicating whether power reboot exists for the server.
func (host *HetznerBareMetalHost) HasPowerReboot() bool {
for _, rt := range host.Spec.Status.RebootTypes {
if rt == RebootTypePower {
return true
}
}
return false
}

// NeedsProvisioning compares the settings with the provisioning
// status and returns true when more work is needed or false
// otherwise.
Expand Down
27 changes: 0 additions & 27 deletions api/v1beta1/hetznerbaremetalhost_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,33 +232,6 @@ var _ = Describe("Test HasHardwareReboot", func() {
)
})

var _ = Describe("Test HasPowerReboot", func() {
type testCaseHasPowerReboot struct {
rebootTypes []RebootType
expectBool bool
}

DescribeTable("Test HasPowerReboot",
func(tc testCaseHasPowerReboot) {
host := HetznerBareMetalHost{}
host.Spec.Status.RebootTypes = tc.rebootTypes
Expect(host.HasPowerReboot()).Should(Equal(tc.expectBool))
},
Entry("has power reboot - single reboot type", testCaseHasPowerReboot{
rebootTypes: []RebootType{RebootTypePower},
expectBool: true,
}),
Entry("has power reboot - multiple reboot types", testCaseHasPowerReboot{
rebootTypes: []RebootType{RebootTypeSoftware, RebootTypePower},
expectBool: true,
}),
Entry("has no power reboot", testCaseHasPowerReboot{
rebootTypes: []RebootType{RebootTypeSoftware, RebootTypeManual},
expectBool: false,
}),
)
})

var _ = Describe("Test NeedsProvisioning", func() {
type testCaseNeedsProvisioning struct {
installImage *InstallImage
Expand Down
1 change: 1 addition & 0 deletions hack/filter-caph-controller-manager-logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
'Update to resource only changes insignificant fields',
'"approved csr"',
'"Registering webhook"',
"'statusCode': 200, 'method': 'GET', 'url': 'https://robot-ws.your-server.de",
]

def main():
Expand Down
55 changes: 47 additions & 8 deletions pkg/services/baremetal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,11 @@ func (s *Service) actionPreparing() actionResult {
if err := handleSSHError(sshClient.Reboot()); err != nil {
return actionError{err: fmt.Errorf("failed to reboot server via ssh: %w", err)}
}

msg := fmt.Sprintf("Phase %s: Rebooting via ssh into rescue mode.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeSSHRebootTriggered, "ssh reboot triggered")
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeSSHRebootTriggered, msg)
return actionComplete{}
}

Expand All @@ -239,8 +241,11 @@ func (s *Service) actionPreparing() actionResult {
return actionError{err: fmt.Errorf(errMsgFailedReboot, err)}
}

msg := fmt.Sprintf("Phase %s: Reboot into rescue system via rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootType)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(errorType, "software/hardware reboot triggered")
s.scope.HetznerBareMetalHost.SetError(errorType, msg)
return actionComplete{}
}

Expand Down Expand Up @@ -332,6 +337,9 @@ func (s *Service) handleIncompleteBoot(isRebootIntoRescue, isTimeout, isConnecti
// if error has occurred before, check the timeout
if hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, time.Minute) {
msg := "Connection error when targeting server with ssh that might be due to a wrong ssh port. Please check."
if isRebootIntoRescue {
msg = "Connection error. Can't reach rescue system via ssh."
}
conditions.MarkFalse(
s.scope.HetznerBareMetalHost,
infrav1.ProvisionSucceededCondition,
Expand Down Expand Up @@ -389,6 +397,10 @@ func (s *Service) handleErrorTypeSSHRebootFailed(isSSHTimeoutError, wantsRescue
// right hostname. This means that the server has not been rebooted and we need to escalate.
// If we got a timeout error from ssh, it means that the server has not yet finished rebooting.
// If the timeout for ssh reboots has been reached, then escalate.
rebootInto := "node"
if wantsRescue {
rebootInto = "rescue mode"
}
if !isSSHTimeoutError || hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, sshResetTimeout) {
if wantsRescue {
// make sure hat we boot into rescue mode if that is necessary
Expand All @@ -404,9 +416,11 @@ func (s *Service) handleErrorTypeSSHRebootFailed(isSSHTimeoutError, wantsRescue
s.handleRobotRateLimitExceeded(err, rebootServerStr)
return fmt.Errorf(errMsgFailedReboot, err)
}

msg := fmt.Sprintf("Phase %s: Reboot via ssh into %s failed. Now using rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootInto, rebootType)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(errorType, "ssh reboot timed out")
s.scope.HetznerBareMetalHost.SetError(errorType, msg)
}
return nil
}
Expand All @@ -429,6 +443,10 @@ func rebootAndErrorTypeAfterTimeout(host *infrav1.HetznerBareMetalHost) (infrav1
}

func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRescue bool) error {
rebootInto := "node"
if wantsRescue {
rebootInto = "rescue mode"
}
// If it is not a timeout error, then the ssh command (get hostname) worked, but didn't give us the
// right hostname. This means that the server has not been rebooted and we need to escalate.
// If we got a timeout error from ssh, it means that the server has not yet finished rebooting.
Expand All @@ -445,15 +463,21 @@ func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRe
s.handleRobotRateLimitExceeded(err, rebootServerStr)
return fmt.Errorf(errMsgFailedReboot, err)
}

msg := fmt.Sprintf("Phase %s: Reboot via type 'software' into %s failed. Now using rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootInto, infrav1.RebootTypeHardware)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
// we immediately set an error message in the host status to track the reboot we just performed
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeHardwareRebootTriggered, "software reboot timed out")
s.scope.HetznerBareMetalHost.SetError(infrav1.ErrorTypeHardwareRebootTriggered, msg)
}

return nil
}

func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRescue bool) error {
rebootInto := "node"
if wantsRescue {
rebootInto = "rescue mode"
}
// If it is not a timeout error, then the ssh command (get hostname) worked, but didn't give us the
// right hostname. This means that the server has not been rebooted and we need to escalate.
// If we got a timeout error from ssh, it means that the server has not yet finished rebooting.
Expand All @@ -475,6 +499,9 @@ func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRe
s.handleRobotRateLimitExceeded(err, rebootServerStr)
return fmt.Errorf(errMsgFailedReboot, err)
}
msg := fmt.Sprintf("Phase %s: Reboot via ssh into %s failed. Now using rebootType %q.",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState, rebootInto, infrav1.RebootTypeHardware)
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", msg)
}
return nil
}
Expand Down Expand Up @@ -537,6 +564,12 @@ func (s *Service) actionRegistering() actionResult {
if err != nil {
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
}
timeSinceReboot := "unknown"
if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil {
timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String()
}
s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "stdout", out.StdOut, "stderr", out.StdErr, "err", out.Err.Error(),
"isSSHTimeoutError", isSSHTimeoutError, "isSSHConnectionRefusedError", isSSHConnectionRefusedError, "timeSinceReboot", timeSinceReboot)
return actionContinue{delay: 10 * time.Second}
}

Expand Down Expand Up @@ -708,7 +741,7 @@ func (s *Service) analyzeSSHErrorRegistering(sshErr error) (isSSHTimeoutError, i
}
reterr = fmt.Errorf("wrong ssh key: %w", sshErr)
case sshclient.IsConnectionRefusedError(sshErr):
if !rebootTriggered {
if !rebootTriggered && s.scope.HetznerBareMetalHost.Spec.Status.ErrorType != infrav1.ErrorTypeHardwareRebootTriggered {
// Reboot did not trigger
return false, false, nil
}
Expand Down Expand Up @@ -1115,6 +1148,9 @@ func (s *Service) actionImageInstalling() actionResult {
record.Warn(s.scope.HetznerBareMetalHost, "RebootFailed", err.Error())
return actionError{err: fmt.Errorf("failed to reboot server: %w", err)}
}
record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", "Phase %s: reboot via ssh into node (after machine image was installed)",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState)

s.scope.Logger.Info("RebootAfterInstallimageSucceeded", "stdout", out.StdOut, "stderr", out.StdErr)

// clear potential errors - all done
Expand Down Expand Up @@ -1661,6 +1697,9 @@ func (s *Service) actionProvisioned() actionResult {
if err := handleSSHError(out); err != nil {
return actionError{err: err}
}

record.Eventf(s.scope.HetznerBareMetalHost, "RebootBMServer", "Phase %s: rebooting via type ssh (because reboot annotation was set).",
s.scope.HetznerBareMetalHost.Spec.Status.ProvisioningState)
s.scope.HetznerBareMetalHost.Spec.Status.Rebooted = true
return actionContinue{delay: 10 * time.Second}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/services/baremetal/host/host_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() {
err: sshclient.ErrConnectionRefused,
rescueActive: true,
expectedIsTimeout: false,
expectedIsConnectionRefused: false,
expectedIsConnectionRefused: true,
expectedErrMessage: "",
}),
Entry("connectionRefused error, rescue not active", testCaseAnalyzeSSHOutputInstallImageOutErr{
Expand Down

0 comments on commit 3aa8cd2

Please sign in to comment.