Skip to content

Commit

Permalink
MGMT-4966 Display a better message when media disconnection failure o…
Browse files Browse the repository at this point in the history
…ccurs

When a media disconnection error occurred the system became unstable and the step execution might be stuck on IO errors or failed with an unfriendly message.
Media disconnection occurred when a network drive (mostly PXE) has any errors (Network issue) and the controller disconnects it automatically.
Since this error is common to all the steps, we add this verification to the agent
The indication to error is reading from the file failed
  • Loading branch information
Michael Levy committed May 26, 2021
1 parent 0827f18 commit 6b74403
Showing 1 changed file with 125 additions and 21 deletions.
146 changes: 125 additions & 21 deletions src/commands/step_processor.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
package commands

import (
"fmt"
"io"
"net/http"
"os"
"strings"
"time"

"github.com/pkg/errors"

"github.com/go-openapi/swag"

log "github.com/sirupsen/logrus"
Expand All @@ -20,6 +26,14 @@ import (

type HandlerType func(string, ...string) (stdout string, stderr string, exitCode int)

type errorCode int

// TODO - add an ErrorCode Enum to the swagger
const (
Undetected errorCode = 999
MediaDisconnected errorCode = 100
)

type stepSession struct {
session.InventorySession
}
Expand All @@ -33,36 +47,30 @@ func newSession() *stepSession {
return &ret
}

func (s *stepSession) sendStepReply(stepType models.StepType, stepID, output, errStr string, exitCode int) {
if exitCode == 0 && alreadyExistsInService(stepType, output) {
s.Logger().Infof("Result for %s already exists in assisted service", string(stepType))
func (s *stepSession) sendStepReply(reply models.StepReply) {
if reply.ExitCode == 0 && alreadyExistsInService(reply.StepType, reply.Output) {
s.Logger().Infof("Result for %s already exists in assisted service", string(reply.StepType))
return
}
logFunc := s.Logger().Infof
if exitCode != 0 {
if reply.ExitCode != 0 {
logFunc = s.Logger().Warnf
}

if stepType == models.StepTypeFreeNetworkAddresses {
if reply.StepType == models.StepTypeFreeNetworkAddresses {
// the free-addresses step's output spams the log too much
logFunc("Sending step <%s> reply error <%s> exit-code <%d>", stepID, errStr, exitCode)
logFunc("Sending step <%s> reply error <%s> exit-code <%d>", reply.StepID, reply.Error, reply.ExitCode)
} else {
logFunc("Sending step <%s> reply output <%s> error <%s> exit-code <%d>", stepID, output, errStr, exitCode)
logFunc("Sending step <%s> reply output <%s> error <%s> exit-code <%d>", reply.StepID, reply.Output, reply.Error, reply.ExitCode)
}

params := installer.PostStepReplyParams{
HostID: strfmt.UUID(config.GlobalAgentConfig.HostID),
ClusterID: strfmt.UUID(config.GlobalAgentConfig.ClusterID),
DiscoveryAgentVersion: &config.GlobalAgentConfig.AgentVersion,
Reply: &reply,
}
reply := models.StepReply{
StepType: stepType,
StepID: stepID,
ExitCode: int64(exitCode),
Output: output,
Error: errStr,
}
params.Reply = &reply

_, err := s.Client().Installer.PostStepReply(s.Context(), &params)
if err != nil {
switch errValue := err.(type) {
Expand All @@ -75,12 +83,22 @@ func (s *stepSession) sendStepReply(stepType models.StepType, stepID, output, er
default:
s.Logger().WithError(err).Warn("Error posting step reply")
}
} else if exitCode == 0 {
storeInCache(stepType, output)
} else if reply.ExitCode == 0 {
storeInCache(reply.StepType, reply.Output)
}
}

func (s *stepSession) handleSingleStep(stepType models.StepType, stepID string, command string, args []string, handler HandlerType) {
func (s *stepSession) createStepReply(stepType models.StepType, stepID string, output string, errStr string, exitCode int) models.StepReply {
return models.StepReply{
StepType: stepType,
StepID: stepID,
ExitCode: int64(exitCode),
Output: output,
Error: errStr,
}
}

func (s *stepSession) handleSingleStep(stepType models.StepType, stepID string, command string, args []string, handler HandlerType) models.StepReply {
s.Logger().Infof("Executing step: <%s>, command: <%s>, args: <%v>", stepID, command, args)
stdout, stderr, exitCode := handler(command, args...)
if exitCode != 0 {
Expand All @@ -92,21 +110,107 @@ stderr:
%v
`, exitCode, stepID, command, args, stdout, stderr)
}
s.sendStepReply(stepType, stepID, stdout, stderr, exitCode)

return s.createStepReply(stepType, stepID, stdout, stderr, exitCode)
}

func (s *stepSession) handleSteps(steps *models.Steps) {
for _, step := range steps.Instructions {
if step.Command == "" {
errStr := "Missing command"
s.Logger().Warn(errStr)
s.sendStepReply(step.StepType, step.StepID, "", errStr, -1)
s.sendStepReply(s.createStepReply(step.StepType, step.StepID, "", errStr, -1))
continue
}
go s.handleSingleStep(step.StepType, step.StepID, step.Command, step.Args, util.ExecutePrivileged)

go func(step *models.Step) {
if code, err := s.diagnoseSystem(); code != Undetected {
log.Errorf("System issue detected before running step: <%s>, command: <%s>, args: <%v>: %s - stopping the execution", step.StepID, step.Command, step.Args, err.Error())
s.sendStepReply(s.createStepReply(step.StepType, step.StepID, "", err.Error(), int(code)))
return
}

reply := s.handleSingleStep(step.StepType, step.StepID, step.Command, step.Args, util.ExecutePrivileged)

if reply.ExitCode != 0 {
if code, err := s.diagnoseSystem(); code != Undetected {
log.Errorf("System issue detected after running step: <%s>, command: <%s>, args: <%v>: %s", step.StepID, step.Command, step.Args, err.Error())
reply.ExitCode = int64(code)
}
}

s.sendStepReply(reply)
}(step)
}
}

// diagnoseSystem runs quick validations that need to need to occur before step and after a failure.
// This is in order to detect and report known problems otherwise manifest as confusing error messages or stuck the whole system in the steps themselves.
// One common example of that is virtual media disconnection.
func (s *stepSession) diagnoseSystem() (errorCode, error) {
source, err := s.getMountpointSourceDeviceFile()

if err != nil {
log.Warn(err)
return Undetected, nil
}

if source == "" {
return Undetected, nil
}

file, err := os.Open(source)
if err != nil {
return MediaDisconnected, errors.Wrap(err, "cannot access the media (ISO) - media was likely disconnected")
}

defer file.Close()

_, err = io.ReadFull(file, make([]byte, 2, 2))
if err != nil {
return MediaDisconnected, errors.Wrap(err, "cannot read from the media (ISO) - media was likely disconnected")
}

return Undetected, nil
}

func (s *stepSession) getMountpointSourceDeviceFile() (string, error) {
mediaPath := "/run/media/iso"

// Media disconnection issue occurs only for a full-ISO installation
// mostly when serving iso via virtual media via sub optimal networking conditions.
// The minimal-ISO loaded very early and stay in memory. We don't need to read them from the ISO once they're loaded
// The media path exists only for the full-ISO so we can just eliminate this check.
if _, err := os.Stat(mediaPath); err != nil {
return "", nil
}

stdout, stderr, exitCode := util.ExecutePrivileged("findmnt", "--raw", "--noheadings", "--output", "SOURCE,TARGET", "--target", mediaPath)

errorMessage := "failed to validate media disconnection - continuing"

if exitCode != 0 {
return "", errors.Errorf("%s: %s", errorMessage, stderr)
}

if stdout == "" {
return "", errors.Errorf("%s: cannot find ISO mountpoint source", errorMessage)
}

fields := strings.Fields(stdout)

if fields[1] != mediaPath {
return "", fmt.Errorf("%s: media mounted to %s instead of directly to %s", errorMessage, fields[1], mediaPath)
}

source := fields[0]
if source == "" || !strings.HasPrefix(source, "/dev") {
return "", fmt.Errorf("%s: the mount source isn't a device file %s", errorMessage, source)
}

return source, nil
}

func (s *stepSession) processSingleSession() (int64, string) {
params := installer.GetNextStepsParams{
HostID: strfmt.UUID(config.GlobalAgentConfig.HostID),
Expand Down

0 comments on commit 6b74403

Please sign in to comment.