From 6b74403acae4c07e51c5d750049c75dd7eea2964 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Wed, 5 May 2021 15:10:03 +0300 Subject: [PATCH] MGMT-4966 Display a better message when media disconnection failure occurs When a media disconnection error occurred the system became unstable and the step execution might be stuck on IO errors or failed with an unfriendly message. Media disconnection occurred when a network drive (mostly PXE) has any errors (Network issue) and the controller disconnects it automatically. Since this error is common to all the steps, we add this verification to the agent The indication to error is reading from the file failed --- src/commands/step_processor.go | 146 ++++++++++++++++++++++++++++----- 1 file changed, 125 insertions(+), 21 deletions(-) diff --git a/src/commands/step_processor.go b/src/commands/step_processor.go index 51724fde7..e43c63f62 100644 --- a/src/commands/step_processor.go +++ b/src/commands/step_processor.go @@ -1,9 +1,15 @@ package commands import ( + "fmt" + "io" "net/http" + "os" + "strings" "time" + "github.com/pkg/errors" + "github.com/go-openapi/swag" log "github.com/sirupsen/logrus" @@ -20,6 +26,14 @@ import ( type HandlerType func(string, ...string) (stdout string, stderr string, exitCode int) +type errorCode int + +// TODO - add an ErrorCode Enum to the swagger +const ( + Undetected errorCode = 999 + MediaDisconnected errorCode = 100 +) + type stepSession struct { session.InventorySession } @@ -33,36 +47,30 @@ func newSession() *stepSession { return &ret } -func (s *stepSession) sendStepReply(stepType models.StepType, stepID, output, errStr string, exitCode int) { - if exitCode == 0 && alreadyExistsInService(stepType, output) { - s.Logger().Infof("Result for %s already exists in assisted service", string(stepType)) +func (s *stepSession) sendStepReply(reply models.StepReply) { + if reply.ExitCode == 0 && alreadyExistsInService(reply.StepType, reply.Output) { + s.Logger().Infof("Result for %s already exists in assisted service", string(reply.StepType)) return } logFunc := s.Logger().Infof - if exitCode != 0 { + if reply.ExitCode != 0 { logFunc = s.Logger().Warnf } - if stepType == models.StepTypeFreeNetworkAddresses { + if reply.StepType == models.StepTypeFreeNetworkAddresses { // the free-addresses step's output spams the log too much - logFunc("Sending step <%s> reply error <%s> exit-code <%d>", stepID, errStr, exitCode) + logFunc("Sending step <%s> reply error <%s> exit-code <%d>", reply.StepID, reply.Error, reply.ExitCode) } else { - logFunc("Sending step <%s> reply output <%s> error <%s> exit-code <%d>", stepID, output, errStr, exitCode) + logFunc("Sending step <%s> reply output <%s> error <%s> exit-code <%d>", reply.StepID, reply.Output, reply.Error, reply.ExitCode) } params := installer.PostStepReplyParams{ HostID: strfmt.UUID(config.GlobalAgentConfig.HostID), ClusterID: strfmt.UUID(config.GlobalAgentConfig.ClusterID), DiscoveryAgentVersion: &config.GlobalAgentConfig.AgentVersion, + Reply: &reply, } - reply := models.StepReply{ - StepType: stepType, - StepID: stepID, - ExitCode: int64(exitCode), - Output: output, - Error: errStr, - } - params.Reply = &reply + _, err := s.Client().Installer.PostStepReply(s.Context(), ¶ms) if err != nil { switch errValue := err.(type) { @@ -75,12 +83,22 @@ func (s *stepSession) sendStepReply(stepType models.StepType, stepID, output, er default: s.Logger().WithError(err).Warn("Error posting step reply") } - } else if exitCode == 0 { - storeInCache(stepType, output) + } else if reply.ExitCode == 0 { + storeInCache(reply.StepType, reply.Output) } } -func (s *stepSession) handleSingleStep(stepType models.StepType, stepID string, command string, args []string, handler HandlerType) { +func (s *stepSession) createStepReply(stepType models.StepType, stepID string, output string, errStr string, exitCode int) models.StepReply { + return models.StepReply{ + StepType: stepType, + StepID: stepID, + ExitCode: int64(exitCode), + Output: output, + Error: errStr, + } +} + +func (s *stepSession) handleSingleStep(stepType models.StepType, stepID string, command string, args []string, handler HandlerType) models.StepReply { s.Logger().Infof("Executing step: <%s>, command: <%s>, args: <%v>", stepID, command, args) stdout, stderr, exitCode := handler(command, args...) if exitCode != 0 { @@ -92,7 +110,8 @@ stderr: %v `, exitCode, stepID, command, args, stdout, stderr) } - s.sendStepReply(stepType, stepID, stdout, stderr, exitCode) + + return s.createStepReply(stepType, stepID, stdout, stderr, exitCode) } func (s *stepSession) handleSteps(steps *models.Steps) { @@ -100,13 +119,98 @@ func (s *stepSession) handleSteps(steps *models.Steps) { if step.Command == "" { errStr := "Missing command" s.Logger().Warn(errStr) - s.sendStepReply(step.StepType, step.StepID, "", errStr, -1) + s.sendStepReply(s.createStepReply(step.StepType, step.StepID, "", errStr, -1)) continue } - go s.handleSingleStep(step.StepType, step.StepID, step.Command, step.Args, util.ExecutePrivileged) + + go func(step *models.Step) { + if code, err := s.diagnoseSystem(); code != Undetected { + log.Errorf("System issue detected before running step: <%s>, command: <%s>, args: <%v>: %s - stopping the execution", step.StepID, step.Command, step.Args, err.Error()) + s.sendStepReply(s.createStepReply(step.StepType, step.StepID, "", err.Error(), int(code))) + return + } + + reply := s.handleSingleStep(step.StepType, step.StepID, step.Command, step.Args, util.ExecutePrivileged) + + if reply.ExitCode != 0 { + if code, err := s.diagnoseSystem(); code != Undetected { + log.Errorf("System issue detected after running step: <%s>, command: <%s>, args: <%v>: %s", step.StepID, step.Command, step.Args, err.Error()) + reply.ExitCode = int64(code) + } + } + + s.sendStepReply(reply) + }(step) } } +// diagnoseSystem runs quick validations that need to need to occur before step and after a failure. +// This is in order to detect and report known problems otherwise manifest as confusing error messages or stuck the whole system in the steps themselves. +// One common example of that is virtual media disconnection. +func (s *stepSession) diagnoseSystem() (errorCode, error) { + source, err := s.getMountpointSourceDeviceFile() + + if err != nil { + log.Warn(err) + return Undetected, nil + } + + if source == "" { + return Undetected, nil + } + + file, err := os.Open(source) + if err != nil { + return MediaDisconnected, errors.Wrap(err, "cannot access the media (ISO) - media was likely disconnected") + } + + defer file.Close() + + _, err = io.ReadFull(file, make([]byte, 2, 2)) + if err != nil { + return MediaDisconnected, errors.Wrap(err, "cannot read from the media (ISO) - media was likely disconnected") + } + + return Undetected, nil +} + +func (s *stepSession) getMountpointSourceDeviceFile() (string, error) { + mediaPath := "/run/media/iso" + + // Media disconnection issue occurs only for a full-ISO installation + // mostly when serving iso via virtual media via sub optimal networking conditions. + // The minimal-ISO loaded very early and stay in memory. We don't need to read them from the ISO once they're loaded + // The media path exists only for the full-ISO so we can just eliminate this check. + if _, err := os.Stat(mediaPath); err != nil { + return "", nil + } + + stdout, stderr, exitCode := util.ExecutePrivileged("findmnt", "--raw", "--noheadings", "--output", "SOURCE,TARGET", "--target", mediaPath) + + errorMessage := "failed to validate media disconnection - continuing" + + if exitCode != 0 { + return "", errors.Errorf("%s: %s", errorMessage, stderr) + } + + if stdout == "" { + return "", errors.Errorf("%s: cannot find ISO mountpoint source", errorMessage) + } + + fields := strings.Fields(stdout) + + if fields[1] != mediaPath { + return "", fmt.Errorf("%s: media mounted to %s instead of directly to %s", errorMessage, fields[1], mediaPath) + } + + source := fields[0] + if source == "" || !strings.HasPrefix(source, "/dev") { + return "", fmt.Errorf("%s: the mount source isn't a device file %s", errorMessage, source) + } + + return source, nil +} + func (s *stepSession) processSingleSession() (int64, string) { params := installer.GetNextStepsParams{ HostID: strfmt.UUID(config.GlobalAgentConfig.HostID),