Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-89727: Fix os.walk to handle late editing of dirs #100703

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 66 additions & 36 deletions Lib/os.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False):
subdirectories (directories are generated bottom up).

When topdown is true, the caller can modify the dirnames list in-place
(e.g., via del or slice assignment), and walk will only recurse into the
(e.g., via del or slice assignment), and walk will only traverse into the
subdirectories whose names remain in dirnames; this can be used to prune the
search, or to impose a specific order of visiting. Modifying dirnames when
topdown is false has no effect on the behavior of os.walk(), since the
Expand Down Expand Up @@ -341,31 +341,27 @@ def walk(top, topdown=True, onerror=None, followlinks=False):
"""
sys.audit("os.walk", top, topdown, onerror, followlinks)

stack = [fspath(top)]
islink, join = path.islink, path.join
while stack:
top = stack.pop()
if isinstance(top, tuple):
yield top
continue
top = fspath(top)
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
carljm marked this conversation as resolved.
Show resolved Hide resolved
return

stack = []
islink, join = path.islink, path.join
while True:
dirs = []
nondirs = []
walk_dirs = []
use_entry = True

# We may not have read permission for top, in which case we can't
# get a list of the files the directory contains.
# We suppress the exception here, rather than blow up for a
# minor reason when (say) a thousand readable directories are still
# left to visit.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment belongs with the try: scandir block, which is moved from here and now exists in three separate locations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. It can probably just go on the first instance of this block

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually in the latest commit where this block appears in separate functions, I decided to just remove this comment. The behavior is described in the docstring and I think the try-except block is pretty clear. Happy to add it or part of it back if deemed necessary

try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
continue

cont = False
with scandir_it:
while True:
try:
Expand All @@ -376,7 +372,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False):
except OSError as error:
if onerror is not None:
onerror(error)
cont = True
use_entry = False
break

try:
Expand Down Expand Up @@ -408,27 +404,61 @@ def walk(top, topdown=True, onerror=None, followlinks=False):

if walk_into:
walk_dirs.append(entry.path)
if cont:
continue

if topdown:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Such a high percentage of code in this function is now under either if topdown or the reverse that I'm kind of curious what it would look like to just have separate (internal) functions for topdown vs bottom-up. But this would still increase code duplication significantly, and move the duplicated code/structure further apart, so I suspect it's still better this way.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually thinking about this. I just gave it a shot to see. This seems to me like one of those cases where there are enough little differences in logic that it's much cleaner to separate the two. If we didn't support dir modification for topdown or didn't care about performance it might be simpler, but when you have a lot of the same logic interspersed by a lot of little differences it gets messy. In these kinds of situations I also tend to prefer some duplication between functions over one big function with conditions inside of loops.

I also find it easier to look at the two sets of logic separately, but either way works.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think the split version seems fine. Will wait a bit and see if some core devs have opinions.

# Yield before sub-directory traversal if going top down
yield top, dirs, nondirs
# Traverse into sub-directories
for dirname in reversed(dirs):
new_path = join(top, dirname)
# bpo-23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the "yield"
# above.
if followlinks or not islink(new_path):
stack.append(new_path)
if use_entry:
# Yield before sub-directory traversal if going top down
yield top, dirs, nondirs
# Append dir names to walk along with top, their parent dir
stack.append([top, iter(dirs)])

# Set top and scandir_it for the next iteration
while stack:
root, dirs = stack[-1]
for dirname in dirs:
top = join(root, dirname)
# bpo-23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the previous
# "yield"
if followlinks or not islink(top):
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
else:
break
else:
stack.pop()
continue
break
else:
return
else:
# Yield after sub-directory traversal if going bottom up
stack.append((top, dirs, nondirs))
# Traverse into sub-directories
for new_path in reversed(walk_dirs):
stack.append(new_path)
if use_entry:
# Traverse into sub-directories by appending a value
# (entry, dirs) where dirs will be walked and then
# when dirs is exhausted entry will be yielded
stack.append(((top, dirs, nondirs), iter(walk_dirs)))

# Set top and scandir_it for the next iteration
while stack:
entry, dirs = stack[-1]
for top in dirs:
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
else:
break
else:
stack.pop()
yield entry
continue
break
else:
return

__all__.append("walk")

Expand Down
57 changes: 57 additions & 0 deletions Lib/test/test_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -1512,6 +1512,57 @@ def test_walk_above_recursion_limit(self):
self.assertEqual(sorted(dirs), ["SUB1", "SUB2", "d"])
self.assertEqual(all, expected)

def test_walk_late_dirs_modification(self):
extra_parts = [
("SUB3", "SUB31"),
("SUB4", "SUB41"),
("SUB5", "SUB51"),
]
for dir_parts in extra_parts:
os.makedirs(os.path.join(self.walk_path, *dir_parts))

all = self.walk(self.walk_path)
result = []
result.append(next(all))
dirs = result[0][1]
dirs.sort()

# SUB1
# SUB1/SUB11
result.append(next(all))
result.append(next(all))
# SUB2 is removed
dirs.pop(1)
# swap SUB3 and SUB4
dirs[1], dirs[2] = dirs[2], dirs[1]
# SUB4
# SUB4/SUB41
# SUB3
result.append(next(all))
result.append(next(all))
result.append(next(all))
# clear while between SUB3 and SUB3/SUB31
dirs.clear()
result.extend(all)

expected_parts = [
([], [], ["tmp1"]),
(["SUB1"], ["SUB11"], ["tmp2"]),
(["SUB1", "SUB11"], [], []),
# SUB2 skipped
(["SUB4"], ["SUB41"], []),
(["SUB4", "SUB41"], [], []),
(["SUB3"], ["SUB31"], []),
(["SUB3", "SUB31"], [], []),
# SUB5 skipped
# SUB5/SUB51 skipped
]
expected = [
(os.path.join(self.walk_path, *r), d, f)
for r, d, f in expected_parts
]
self.assertEqual(result, expected)


@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
class FwalkTests(WalkTests):
Expand Down Expand Up @@ -1603,6 +1654,10 @@ def walk(self, top, **kwargs):
bdirs[:] = list(map(os.fsencode, dirs))
bfiles[:] = list(map(os.fsencode, files))

# modifying bdirs in self.walk above prevents late modification
test_walk_late_dirs_modification = None


@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
class BytesFwalkTests(FwalkTests):
"""Tests for os.walk() with bytes."""
Expand All @@ -1615,6 +1670,8 @@ def fwalk(self, top='.', *args, **kwargs):
bdirs[:] = list(map(os.fsencode, dirs))
bfiles[:] = list(map(os.fsencode, files))

test_walk_late_dirs_modification = None


class MakedirTests(unittest.TestCase):
def setUp(self):
Expand Down