Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-89727: Fix os.walk to handle late editing of dirs #100703

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 111 additions & 49 deletions Lib/os.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False):
subdirectories (directories are generated bottom up).

When topdown is true, the caller can modify the dirnames list in-place
(e.g., via del or slice assignment), and walk will only recurse into the
(e.g., via del or slice assignment), and walk will only traverse into the
subdirectories whose names remain in dirnames; this can be used to prune the
search, or to impose a specific order of visiting. Modifying dirnames when
topdown is false has no effect on the behavior of os.walk(), since the
Expand Down Expand Up @@ -340,43 +340,35 @@ def walk(top, topdown=True, onerror=None, followlinks=False):

"""
sys.audit("os.walk", top, topdown, onerror, followlinks)
top = fspath(top)
if topdown:
return _walk_topdown(top, onerror, followlinks)
return _walk_bottomup(top, onerror, followlinks)

stack = [fspath(top)]
islink, join = path.islink, path.join
while stack:
top = stack.pop()
if isinstance(top, tuple):
yield top
continue
def _walk_topdown(top, onerror, followlinks):
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
carljm marked this conversation as resolved.
Show resolved Hide resolved
return

stack = []
islink, join = path.islink, path.join
while True:
dirs = []
nondirs = []
walk_dirs = []

# We may not have read permission for top, in which case we can't
# get a list of the files the directory contains.
# We suppress the exception here, rather than blow up for a
# minor reason when (say) a thousand readable directories are still
# left to visit.
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
continue

cont = False
use_entry = True
with scandir_it:
while True:
try:
try:
entry = next(scandir_it)
except StopIteration:
break
entry = next(scandir_it)
except StopIteration:
break
except OSError as error:
if onerror is not None:
onerror(error)
cont = True
use_entry = False
break
carljm marked this conversation as resolved.
Show resolved Hide resolved

try:
Expand All @@ -391,7 +383,72 @@ def walk(top, topdown=True, onerror=None, followlinks=False):
else:
nondirs.append(entry.name)

if not topdown and is_dir:
if use_entry:
# Yield before sub-directory traversal if going top down
yield top, dirs, nondirs
# Append dir names to walk along with top, their parent dir
stack.append([top, iter(dirs)])

# Set top and scandir_it for the next iteration
while stack:
root, dirname_iter = stack[-1]
for dirname in dirname_iter:
top = join(root, dirname)
# bpo-23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the previous
# "yield"
if followlinks or not islink(top):
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
else:
break
else:
stack.pop()
continue
break
else:
return

def _walk_bottomup(top, onerror, followlinks):
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
return

stack = []
islink, join = path.islink, path.join
while True:
dirs = []
nondirs = []
walk_dirs = []
use_entry = True
with scandir_it:
while True:
try:
entry = next(scandir_it)
except StopIteration:
break
except OSError as error:
if onerror is not None:
onerror(error)
use_entry = False
break

try:
is_dir = entry.is_dir()
except OSError:
# If is_dir() raises an OSError, consider the entry not to
# be a directory, same behaviour as os.path.isdir().
is_dir = False

if is_dir:
dirs.append(entry.name)
# Bottom-up: traverse into sub-directory, but exclude
# symlinks to directories if followlinks is False
if followlinks:
Expand All @@ -408,27 +465,32 @@ def walk(top, topdown=True, onerror=None, followlinks=False):

if walk_into:
walk_dirs.append(entry.path)
if cont:
continue

if topdown:
# Yield before sub-directory traversal if going top down
yield top, dirs, nondirs
# Traverse into sub-directories
for dirname in reversed(dirs):
new_path = join(top, dirname)
# bpo-23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the "yield"
# above.
if followlinks or not islink(new_path):
stack.append(new_path)
else:
nondirs.append(entry.name)
if use_entry:
# Traverse into sub-directories by appending a value
# (entry, dirs) where dirs will be walked and then
# when dirs is exhausted entry will be yielded
stack.append(((top, dirs, nondirs), iter(walk_dirs)))

# Set top and scandir_it for the next iteration
while stack:
entry, dirpath_iter = stack[-1]
for top in dirpath_iter:
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
else:
break
else:
stack.pop()
yield entry
continue
break
else:
# Yield after sub-directory traversal if going bottom up
stack.append((top, dirs, nondirs))
# Traverse into sub-directories
for new_path in reversed(walk_dirs):
stack.append(new_path)
return

__all__.append("walk")

Expand Down
57 changes: 57 additions & 0 deletions Lib/test/test_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -1512,6 +1512,57 @@ def test_walk_above_recursion_limit(self):
self.assertEqual(sorted(dirs), ["SUB1", "SUB2", "d"])
self.assertEqual(all, expected)

def test_walk_late_dirs_modification(self):
extra_parts = [
("SUB3", "SUB31"),
("SUB4", "SUB41"),
("SUB5", "SUB51"),
]
for dir_parts in extra_parts:
os.makedirs(os.path.join(self.walk_path, *dir_parts))

all = self.walk(self.walk_path)
result = []
result.append(next(all))
dirs = result[0][1]
dirs.sort()

# SUB1
# SUB1/SUB11
result.append(next(all))
result.append(next(all))
# SUB2 is removed
dirs.pop(1)
# swap SUB3 and SUB4
dirs[1], dirs[2] = dirs[2], dirs[1]
# SUB4
# SUB4/SUB41
# SUB3
result.append(next(all))
result.append(next(all))
result.append(next(all))
# clear while between SUB3 and SUB3/SUB31
dirs.clear()
result.extend(all)

expected_parts = [
([], [], ["tmp1"]),
(["SUB1"], ["SUB11"], ["tmp2"]),
(["SUB1", "SUB11"], [], []),
# SUB2 skipped
(["SUB4"], ["SUB41"], []),
(["SUB4", "SUB41"], [], []),
(["SUB3"], ["SUB31"], []),
(["SUB3", "SUB31"], [], []),
# SUB5 skipped
# SUB5/SUB51 skipped
]
expected = [
(os.path.join(self.walk_path, *r), d, f)
for r, d, f in expected_parts
]
self.assertEqual(result, expected)


@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
class FwalkTests(WalkTests):
Expand Down Expand Up @@ -1603,6 +1654,10 @@ def walk(self, top, **kwargs):
bdirs[:] = list(map(os.fsencode, dirs))
bfiles[:] = list(map(os.fsencode, files))

# modifying bdirs in self.walk above prevents late modification
test_walk_late_dirs_modification = None


@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
class BytesFwalkTests(FwalkTests):
"""Tests for os.walk() with bytes."""
Expand All @@ -1615,6 +1670,8 @@ def fwalk(self, top='.', *args, **kwargs):
bdirs[:] = list(map(os.fsencode, dirs))
bfiles[:] = list(map(os.fsencode, files))

test_walk_late_dirs_modification = None


class MakedirTests(unittest.TestCase):
def setUp(self):
Expand Down