Skip to content

Commit

Permalink
MGMT-4966 Display a better message when media disconnection failure o…
Browse files Browse the repository at this point in the history
…ccurs

When a media disconnection error occurs, the system becomes unstable, and step execution may get stuck on I/O errors or fail with an unfriendly message.
Media disconnection occurs when a network drive (mostly PXE) encounters errors (e.g. a network issue) and the controller disconnects it automatically.
Since this error is common to all the steps, we add this verification to the agent.
The indication of the error is that reading from the file fails.
  • Loading branch information
Michael Levy committed May 23, 2021
1 parent 72b14c8 commit ded14f7
Showing 1 changed file with 104 additions and 17 deletions.
121 changes: 104 additions & 17 deletions src/commands/step_processor.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
package commands

import (
"fmt"
"io"
"net/http"
"os"
"strings"
"time"

"github.com/pkg/errors"

"github.com/go-openapi/swag"

log "github.com/sirupsen/logrus"
Expand Down Expand Up @@ -33,32 +39,26 @@ func newSession() *stepSession {
return &ret
}

func (s *stepSession) sendStepReply(stepType models.StepType, stepID, output, errStr string, exitCode int) {
func (s *stepSession) sendStepReply(reply models.StepReply) {
logFunc := s.Logger().Infof
if exitCode != 0 {
if reply.ExitCode != 0 {
logFunc = s.Logger().Warnf
}

if stepType == models.StepTypeFreeNetworkAddresses {
if reply.StepType == models.StepTypeFreeNetworkAddresses {
// the free-addresses step's output spams the log too much
logFunc("Sending step <%s> reply error <%s> exit-code <%d>", stepID, errStr, exitCode)
logFunc("Sending step <%s> reply error <%s> exit-code <%d>", reply.StepID, reply.Error, reply.ExitCode)
} else {
logFunc("Sending step <%s> reply output <%s> error <%s> exit-code <%d>", stepID, output, errStr, exitCode)
logFunc("Sending step <%s> reply output <%s> error <%s> exit-code <%d>", reply.StepID, reply.Output, reply.Error, reply.ExitCode)
}

params := installer.PostStepReplyParams{
HostID: strfmt.UUID(config.GlobalAgentConfig.HostID),
ClusterID: strfmt.UUID(config.GlobalAgentConfig.ClusterID),
DiscoveryAgentVersion: &config.GlobalAgentConfig.AgentVersion,
Reply: &reply,
}
reply := models.StepReply{
StepType: stepType,
StepID: stepID,
ExitCode: int64(exitCode),
Output: output,
Error: errStr,
}
params.Reply = &reply

_, err := s.Client().Installer.PostStepReply(s.Context(), &params)
if err != nil {
switch errValue := err.(type) {
Expand All @@ -74,7 +74,17 @@ func (s *stepSession) sendStepReply(stepType models.StepType, stepID, output, er
}
}

func (s *stepSession) handleSingleStep(stepType models.StepType, stepID string, command string, args []string, handler HandlerType) {
// createStepReply assembles a models.StepReply from the individual pieces of a
// step result. The integer exit code is widened to the int64 field used by the
// reply model; all other values are copied through unchanged.
func (s *stepSession) createStepReply(stepType models.StepType, stepID string, output string, errStr string, exitCode int) models.StepReply {
	var reply models.StepReply

	reply.StepType = stepType
	reply.StepID = stepID
	reply.Output = output
	reply.Error = errStr
	reply.ExitCode = int64(exitCode)

	return reply
}

func (s *stepSession) handleSingleStep(stepType models.StepType, stepID string, command string, args []string, handler HandlerType) models.StepReply {
s.Logger().Infof("Executing step: <%s>, command: <%s>, args: <%v>", stepID, command, args)
stdout, stderr, exitCode := handler(command, args...)
if exitCode != 0 {
Expand All @@ -86,21 +96,98 @@ stderr:
%v
`, exitCode, stepID, command, args, stdout, stderr)
}
s.sendStepReply(stepType, stepID, stdout, stderr, exitCode)

return s.createStepReply(stepType, stepID, stdout, stderr, exitCode)
}

// handleSteps walks over steps.Instructions and dispatches every step.
// A step with an empty Command is answered immediately with a "Missing command"
// error and exit code -1. Every other step is run in its own goroutine, with
// diagnoseSystem called before and after the step so that a system-level
// problem is reported in the reply. This function does not wait for the
// goroutines it starts.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers, so
// some pre-change lines appear next to their replacements - confirm against the
// repository before treating every line as live code.
func (s *stepSession) handleSteps(steps *models.Steps) {
for _, step := range steps.Instructions {
if step.Command == "" {
errStr := "Missing command"
s.Logger().Warn(errStr)
// NOTE(review): presumably the pre-change form of the call on the next line - confirm.
s.sendStepReply(step.StepType, step.StepID, "", errStr, -1)
s.sendStepReply(s.createStepReply(step.StepType, step.StepID, "", errStr, -1))
continue
}
// NOTE(review): presumably the pre-change dispatch, superseded by the goroutine below - confirm.
go s.handleSingleStep(step.StepType, step.StepID, step.Command, step.Args, util.ExecutePrivileged)

// The step is passed as an argument so the goroutine works on its own copy of the pointer.
go func(step *models.Step) {
// Pre-check: if the diagnosis already fails, reply with that error and do not run the step.
if err := s.diagnoseSystem(); err != nil {
s.sendStepReply(s.createStepReply(step.StepType, step.StepID, "", err.Error(), -1))
return
}

reply := s.handleSingleStep(step.StepType, step.StepID, step.Command, step.Args, util.ExecutePrivileged)

// Post-check: a diagnosis failure after the step overrides the exit code with -1.
if err := s.diagnoseSystem(); err != nil {
reply.ExitCode = -1

// Put the diagnosis error first and keep the step's own error text after it, if any.
if reply.Error != "" {
reply.Error = fmt.Errorf("%w: %s", err, reply.Error).Error()
} else {
reply.Error = err.Error()
}
}

s.sendStepReply(reply)
}(step)
}
}

// diagnoseSystem runs quick validations that need to occur before and after each step.
// This is in order to detect and report known problems that would otherwise manifest as
// confusing error messages, or leave the whole system stuck, inside the steps themselves.
// One common example of that is virtual media disconnection.
//
// The check is best-effort: whenever the ISO mount cannot be located or inspected, a
// message is logged and nil is returned so the step is not blocked. A non-nil error is
// returned only when the device backing the ISO mount cannot be opened or read.
func (s *stepSession) diagnoseSystem() error {
	mediaPath := "/run/media/iso"

	// Media disconnection issue occurs only for a full-ISO installation,
	// mostly when serving the ISO via virtual media over sub-optimal networking conditions.
	// The minimal-ISO is loaded very early and stays in memory, so nothing needs to be read
	// from the ISO once it is loaded.
	// The media path exists only for the full-ISO, so when it is absent the check is skipped.
	if _, err := os.Stat(mediaPath); err != nil {
		return nil
	}

	stdout, stderr, exitCode := util.ExecutePrivileged("findmnt", "--raw", "--noheadings", "--output", "SOURCE,TARGET", "--target", mediaPath)
	if exitCode != 0 {
		log.Error(fmt.Errorf("failed to validate media disconnection - continuing: %w", errors.New(stderr)))
		return nil
	}

	// Expected output is a single "SOURCE TARGET" line. Validate the number of columns
	// before indexing so blank or truncated output cannot cause an out-of-range panic.
	fields := strings.Fields(stdout)
	if len(fields) == 0 {
		s.Logger().Warn("failed to validate media disconnection - continuing: cannot find ISO mountpoint source")
		return nil
	}

	if len(fields) < 2 {
		s.Logger().Warnf("failed to validate media disconnection - continuing: unexpected findmnt output %q", stdout)
		return nil
	}

	source, target := fields[0], fields[1]

	if target != mediaPath {
		s.Logger().Warnf("failed to validate media disconnection - continuing: media mounted to %s instead of directly to %s", target, mediaPath)
		return nil
	}

	if !strings.HasPrefix(source, "/dev") {
		s.Logger().Warnf("failed to validate media disconnection - continuing: the mount source isn't a device file %s", source)
		return nil
	}

	r, err := os.Open(source)
	if err != nil {
		return fmt.Errorf("cannot open the media(ISO) device %s: %w", source, err)
	}

	defer r.Close()

	// Reading a couple of bytes from the device is enough to find out whether it is still reachable.
	if _, err = io.ReadFull(r, make([]byte, 2)); err != nil {
		// Wrap the actual read error; findmnt's stderr is unrelated to this failure.
		err = fmt.Errorf("cannot access the media(ISO) probably media was disconnected: %w", err)
		log.Error(err.Error())
		return err
	}

	return nil
}

func (s *stepSession) processSingleSession() (int64, string) {
params := installer.GetNextStepsParams{
HostID: strfmt.UUID(config.GlobalAgentConfig.HostID),
Expand Down

0 comments on commit ded14f7

Please sign in to comment.