Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the workflow of xfs devices filesystem check and mount #132

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 64 additions & 10 deletions mount/mount_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ const (
fsckErrorsCorrected = 1
// 'fsck' found errors but exited without correcting them
fsckErrorsUncorrected = 4
// 'xfs_repair' found errors but exited without correcting them
xfsRepairErrorsUncorrected = 1
// 'xfs_repair' was unable to proceed due to a dirty log
xfsRepairErrorsDirtyLogs = 2
// Retry 'xfs_repair' three times on failure to make more repairs on successive runs.
xfsRepairRetries = 3
)

// Mounter provides the default implementation of mount.Interface
Expand Down Expand Up @@ -257,7 +263,7 @@ func (mounter *Mounter) GetMountRefs(pathname string) ([]string, error) {
}

// checkAndRepairFileSystem checks and repairs filesystems using command fsck.
func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error {
func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source, target string) error {
klog.V(4).Infof("Checking for issues with fsck on disk: %s", source)
args := []string{"-a", source}
out, err := mounter.Exec.Command("fsck", args...).CombinedOutput()
Expand All @@ -278,7 +284,7 @@ func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error
}

// checkAndRepairXfsFilesystem checks and repairs xfs filesystem using command xfs_repair.
func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) error {
func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source, target string) error {
klog.V(4).Infof("Checking for issues with xfs_repair on disk: %s", source)

args := []string{source}
Expand All @@ -292,18 +298,66 @@ func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) er
return nil
} else {
klog.Warningf("Filesystem corruption was detected for %s, running xfs_repair to repair", source)
out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput()
if err != nil {
return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out)
} else {
klog.Infof("Device %s has errors which were corrected by xfs_repair.", source)
return nil
var uncorrectedErr error

// If xfs_repair fails to repair the file system successfully, try giving the same xfs_repair
// command twice more; xfs_repair may be able to make more repairs on successive runs.
for i := 0; i < xfsRepairRetries; i++ {
27149chen marked this conversation as resolved.
Show resolved Hide resolved
out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput()
if err == nil {
klog.Infof("Device %s has errors which were corrected by xfs_repair.", source)
return nil
27149chen marked this conversation as resolved.
Show resolved Hide resolved
}
e, isExitError := err.(utilexec.ExitError)
if !isExitError {
return NewMountError(HasFilesystemErrors, "failed to run 'xfs_repair' on device %s: %v\n", source, err)

}
switch e.ExitStatus() {
case xfsRepairErrorsUncorrected:
uncorrectedErr = NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out))
// If the exit status is 2, do not retry, replay the dirty logs instead.
case xfsRepairErrorsDirtyLogs:
return mounter.replayXfsDirtyLogs(source, target)
default:
return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out))
}
}
if uncorrectedErr != nil {
return uncorrectedErr
}
}
}
return nil
}

// replayXfsDirtyLogs tries to replay dirty logs by by mounting and
// immediately unmounting the filesystem on the same class of machine
// that crashed.
func (mounter *SafeFormatAndMount) replayXfsDirtyLogs(source, target string) error {
27149chen marked this conversation as resolved.
Show resolved Hide resolved
klog.V(4).Infof("Attempting to replay the dirty logs on device %s", source)
msg := fmt.Sprintf("failed to replay dirty logs on device %s", source)

klog.V(4).Infof("Attempting to mount disk %s at %s", source, target)
if err := mounter.Interface.Mount(source, target, "", []string{"defaults"}); err != nil {
Copy link
Member

@gnufied gnufied Feb 5, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use target field given in the mount function as mount location rather than tempdir? Could a kubelet/driver crash leave the volume mounted in tempdir and never get cleaned up?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think maybe the target is not existing or not available for mounting for some potential reasons. tempdir is more safer to temporarily mount and unmount, and we will make sure it is deleted in the end.
But you are right, it is a risk if the caller crashes between mount and unmount.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Think again, it is the caller of formatAndMount's responsibility to make sure the existence and availablity of the target, so I think we can use the target field here.

return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err)
}

klog.V(4).Infof("Unmounting %s", target)
if err := mounter.Interface.Unmount(target); err != nil {
return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err)
}

klog.V(4).Infof("Checking for issues with xfs_repair on disk %s again", source)
out, err := mounter.Exec.Command("xfs_repair", source).CombinedOutput()
if err != nil {
return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out)
} else {
klog.Infof("Device %s has errors which were corrected by xfs_repair.", source)
return nil
}
}

// formatAndMount uses unix utils to format and mount the given disk
27149chen marked this conversation as resolved.
Show resolved Hide resolved
func (mounter *SafeFormatAndMount) formatAndMount(source string, target string, fstype string, options []string) error {
readOnly := false
Expand Down Expand Up @@ -365,9 +419,9 @@ func (mounter *SafeFormatAndMount) formatAndMount(source string, target string,
var err error
switch existingFormat {
case "xfs":
err = mounter.checkAndRepairXfsFilesystem(source)
err = mounter.checkAndRepairXfsFilesystem(source, target)
default:
err = mounter.checkAndRepairFilesystem(source)
err = mounter.checkAndRepairFilesystem(source, target)
}

if err != nil {
Expand Down
66 changes: 65 additions & 1 deletion mount/safe_format_and_mount_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,76 @@ func TestSafeFormatAndMount(t *testing.T) {
expErrorType: UnknownMountError,
},
{
description: "Test that 'xfs_repair' is called twice but could not repair the filesystem",
description: "Test that 'xfs_repair' is called three times and repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil},
},
},
{
description: "Test that 'xfs_repair' is called four times and repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil},
},
},
{
description: "Test that 'xfs_repair' is called twice but exit with bad code and could not repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 10}},
},
expErrorType: HasFilesystemErrors,
},
{
description: "Test that 'xfs_repair' is called twice but exit with bad error and could not repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", fmt.Errorf("An error occurred")},
},
expErrorType: HasFilesystemErrors,
},
{
description: "Test that 'xfs_repair' is called three times and replay dirty logs",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}},
{"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil},
},
},
{
description: "Test that 'xfs_repair' is called three times and replay dirty logs, but final 'xfs_repair' is failed",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
},
expErrorType: HasFilesystemErrors,
},
{
description: "Test that 'xfs_repair' is called four times but could not repair the filesystem",
fstype: "xfs",
execScripts: []ExecArgs{
{"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil},
{"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
{"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}},
},
expErrorType: HasFilesystemErrors,
},
Expand Down