Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the workflow of xfs devices filesystem check and mount #132

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 72 additions & 10 deletions mount/mount_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ const (
fsckErrorsCorrected = 1
// 'fsck' found errors but exited without correcting them
fsckErrorsUncorrected = 4
// 'xfs_repair' found errors but exited without correcting them
xfsRepairErrorsUncorrected = 1
// 'xfs_repair' was unable to proceed due to a dirty log
xfsRepairErrorsDirtyLogs = 2
// Retry 'xfs_repair' three times on failure to make more repairs on successive runs.
xfsRepairRetries = 3
)

// Mounter provides the default implementation of mount.Interface
Expand Down Expand Up @@ -257,7 +263,7 @@ func (mounter *Mounter) GetMountRefs(pathname string) ([]string, error) {
}

// checkAndRepairFileSystem checks and repairs filesystems using command fsck.
func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error {
func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source, target string) error {
klog.V(4).Infof("Checking for issues with fsck on disk: %s", source)
args := []string{"-a", source}
out, err := mounter.Exec.Command("fsck", args...).CombinedOutput()
Expand All @@ -278,7 +284,7 @@ func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error
}

// checkAndRepairXfsFilesystem checks and repairs xfs filesystem using command xfs_repair.
func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) error {
func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source, target string) error {
klog.V(4).Infof("Checking for issues with xfs_repair on disk: %s", source)

args := []string{source}
Expand All @@ -292,18 +298,74 @@ func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) er
return nil
} else {
klog.Warningf("Filesystem corruption was detected for %s, running xfs_repair to repair", source)
out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput()
if err != nil {
return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out)
} else {
klog.Infof("Device %s has errors which were corrected by xfs_repair.", source)
return nil
var uncorrectedErr error

// If xfs_repair fails to repair the file system successfully, try giving the same xfs_repair
// command twice more; xfs_repair may be able to make more repairs on successive runs.
for i := 0; i < xfsRepairRetries; i++ {
27149chen marked this conversation as resolved.
Show resolved Hide resolved
out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput()
if err == nil {
klog.Infof("Device %s has errors which were corrected by xfs_repair.", source)
return nil
27149chen marked this conversation as resolved.
Show resolved Hide resolved
}
e, isExitError := err.(utilexec.ExitError)
if !isExitError {
return NewMountError(HasFilesystemErrors, "failed to run 'xfs_repair' on device %s: %v\n", source, err)

}
switch e.ExitStatus() {
case xfsRepairErrorsUncorrected:
uncorrectedErr = NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out))
// If the exit status is 2, do not retry, replay the dirty logs instead.
case xfsRepairErrorsDirtyLogs:
return mounter.replayXfsDirtyLogs(source, target)
default:
return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out))
}
}
if uncorrectedErr != nil {
return uncorrectedErr
}
}
}
return nil
}

// replayXfsDirtyLogs tries to replay dirty logs by by mounting and
// immediately unmounting the filesystem on the same class of machine
// that crashed.
func (mounter *SafeFormatAndMount) replayXfsDirtyLogs(source, target string) error {
27149chen marked this conversation as resolved.
Show resolved Hide resolved
klog.V(4).Infof("Attempting to replay the dirty logs on device %s", source)
msg := fmt.Sprintf("failed to replay dirty logs on device %s", source)

// make sure unmount is always called after mount.
err := func() (returnErr error) {
defer func() {
klog.V(4).Infof("Unmounting %s", target)
if err := mounter.Interface.Unmount(target); err != nil {
returnErr = NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err)
}
}()
klog.V(4).Infof("Attempting to mount disk %s at %s", source, target)
if err := mounter.Interface.Mount(source, target, "", []string{"defaults"}); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From man-page:

In this situation, the log can be replayed by mounting and
       immediately unmounting the filesystem on the same class of machine
       that crashed.  Please make sure that the machine's hardware is
       reliable before replaying to avoid compounding the problems.

Are we being too aggressive in automatically fixing errors here? Can this make problems worse somehow? Should this be configurable? If we merge this PR as it is - it will become the new default even for in-tree Kubernetes drivers, so we have to be careful.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gnufied do you mean that the node might be bad or not the same class, so there is potential risk?

return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err)
}
return nil
}()
if err != nil {
return err
}

klog.V(4).Infof("Checking for issues with xfs_repair on disk %s again", source)
out, err := mounter.Exec.Command("xfs_repair", source).CombinedOutput()
if err != nil {
return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out)
} else {
klog.Infof("Device %s has errors which were corrected by xfs_repair.", source)
return nil
}
}

// formatAndMount uses unix utils to format and mount the given disk
27149chen marked this conversation as resolved.
Show resolved Hide resolved
func (mounter *SafeFormatAndMount) formatAndMount(source string, target string, fstype string, options []string) error {
readOnly := false
Expand Down Expand Up @@ -365,9 +427,9 @@ func (mounter *SafeFormatAndMount) formatAndMount(source string, target string,
var err error
switch existingFormat {
case "xfs":
err = mounter.checkAndRepairXfsFilesystem(source)
err = mounter.checkAndRepairXfsFilesystem(source, target)
default:
err = mounter.checkAndRepairFilesystem(source)
err = mounter.checkAndRepairFilesystem(source, target)
}

if err != nil {
Expand Down
66 changes: 65 additions & 1 deletion mount/safe_format_and_mount_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,76 @@ func TestSafeFormatAndMount(t *testing.T) {
expErrorType: UnknownMountError,
},
{
description: "Test that 'xfs_repair' is called twice but could not repair the filesystem",
description: "Test that 'xfs_repair' is called three times and repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil},
},
},
{
description: "Test that 'xfs_repair' is called four times and repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil},
},
},
{
description: "Test that 'xfs_repair' is called twice but exit with bad code and could not repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 10}},
},
expErrorType: HasFilesystemErrors,
},
{
description: "Test that 'xfs_repair' is called twice but exit with bad error and could not repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", fmt.Errorf("An error occurred")},
},
expErrorType: HasFilesystemErrors,
},
{
description: "Test that 'xfs_repair' is called three times and replay dirty logs",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}},
{"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil},
},
},
{
description: "Test that 'xfs_repair' is called three times and replay dirty logs, but final 'xfs_repair' is failed",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
},
expErrorType: HasFilesystemErrors,
},
{
description: "Test that 'xfs_repair' is called four times but could not repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
},
expErrorType: HasFilesystemErrors,
},
Expand Down