From a2efda96d482434c5be8cc4f3c9bcffd4bc270fc Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Wed, 4 Oct 2023 13:05:37 +0200 Subject: [PATCH] inspect: ignore ENOENT during device lookup When we walk the /dev tree we need to look up all device paths. Now in order to get the major and minor numbers we have to actually stat each device. This can again fail of course. There is at least a race between the readdir and stat calls, so it must ignore ENOENT errors to avoid the race condition as this is not a user problem. Second, we should also not return other errors and just log them instead; returning an error means stopping the walk and returning early which means inspect fails with an error which would be bad. Also there seem to be cases where ENOENT will be returned all the time, e.g. when a device is forcefully removed. In the reported bug this is triggered with iSCSI devices. Because the caller already looks up the device from the created map it reports a warning there if the device is missing on the host so it is not a problem to ignore an error during lookup here. [NO NEW TESTS NEEDED] Requires special device setup to trigger consistently and we cannot do that in CI. Fixes https://issues.redhat.com/browse/RHEL-11158 Signed-off-by: Paul Holzinger --- pkg/util/utils_linux.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/util/utils_linux.go b/pkg/util/utils_linux.go index c094beac75..3cb080481b 100644 --- a/pkg/util/utils_linux.go +++ b/pkg/util/utils_linux.go @@ -48,7 +48,16 @@ func FindDeviceNodes() (map[string]string, error) { info, err := d.Info() if err != nil { - return err + // Info() can return ErrNotExist if the file was deleted between the readdir and stat call. + // This race can happen and is no reason to log an ugly error. If this is a container device + // that is used the code later will print a proper error in such case. 
+ // There also seem to be cases where ErrNotExist is always returned likely due to a weird device + // state, e.g. removing a device forcefully. This can happen with iSCSI devices. + if !errors.Is(err, fs.ErrNotExist) { + logrus.Errorf("Failed to get device information for %s: %v", path, err) + } + // return nil here as we want to continue looking for more devices and not stop the WalkDir() + return nil } // We are a device node. Get major/minor. sysstat, ok := info.Sys().(*syscall.Stat_t)