Skip to content

Commit

Permalink
pythongh-89727: Fix os.walk RecursionError on deep trees (python#99803)
Browse files Browse the repository at this point in the history
Use a stack to implement os.walk iteratively instead of recursively to
avoid hitting recursion limits on deeply nested trees.
  • Loading branch information
jonburdo authored Dec 19, 2022
1 parent 702a5bc commit 797edb2
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 83 deletions.
160 changes: 83 additions & 77 deletions Lib/os.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,89 +340,95 @@ def walk(top, topdown=True, onerror=None, followlinks=False):
"""
sys.audit("os.walk", top, topdown, onerror, followlinks)
return _walk(fspath(top), topdown, onerror, followlinks)

def _walk(top, topdown, onerror, followlinks):
dirs = []
nondirs = []
walk_dirs = []

# We may not have read permission for top, in which case we can't
# get a list of the files the directory contains. os.walk
# always suppressed the exception then, rather than blow up for a
# minor reason when (say) a thousand readable directories are still
# left to visit. That logic is copied here.
try:
# Note that scandir is global in this module due
# to earlier import-*.
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
return

with scandir_it:
while True:
try:
stack = [(False, fspath(top))]
islink, join = path.islink, path.join
while stack:
must_yield, top = stack.pop()
if must_yield:
yield top
continue

dirs = []
nondirs = []
walk_dirs = []

# We may not have read permission for top, in which case we can't
# get a list of the files the directory contains.
# We suppress the exception here, rather than blow up for a
# minor reason when (say) a thousand readable directories are still
# left to visit.
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
continue

cont = False
with scandir_it:
while True:
try:
entry = next(scandir_it)
except StopIteration:
try:
entry = next(scandir_it)
except StopIteration:
break
except OSError as error:
if onerror is not None:
onerror(error)
cont = True
break
except OSError as error:
if onerror is not None:
onerror(error)
return

try:
is_dir = entry.is_dir()
except OSError:
# If is_dir() raises an OSError, consider that the entry is not
# a directory, same behaviour than os.path.isdir().
is_dir = False

if is_dir:
dirs.append(entry.name)
else:
nondirs.append(entry.name)
try:
is_dir = entry.is_dir()
except OSError:
# If is_dir() raises an OSError, consider the entry not to
# be a directory, same behaviour as os.path.isdir().
is_dir = False

if not topdown and is_dir:
# Bottom-up: recurse into sub-directory, but exclude symlinks to
# directories if followlinks is False
if followlinks:
walk_into = True
if is_dir:
dirs.append(entry.name)
else:
try:
is_symlink = entry.is_symlink()
except OSError:
# If is_symlink() raises an OSError, consider that the
# entry is not a symbolic link, same behaviour than
# os.path.islink().
is_symlink = False
walk_into = not is_symlink

if walk_into:
walk_dirs.append(entry.path)

# Yield before recursion if going top down
if topdown:
yield top, dirs, nondirs

# Recurse into sub-directories
islink, join = path.islink, path.join
for dirname in dirs:
new_path = join(top, dirname)
# Issue #23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the "yield"
# above.
if followlinks or not islink(new_path):
yield from _walk(new_path, topdown, onerror, followlinks)
else:
# Recurse into sub-directories
for new_path in walk_dirs:
yield from _walk(new_path, topdown, onerror, followlinks)
# Yield after recursion if going bottom up
yield top, dirs, nondirs
nondirs.append(entry.name)

if not topdown and is_dir:
# Bottom-up: traverse into sub-directory, but exclude
# symlinks to directories if followlinks is False
if followlinks:
walk_into = True
else:
try:
is_symlink = entry.is_symlink()
except OSError:
# If is_symlink() raises an OSError, consider the
# entry not to be a symbolic link, same behaviour
# as os.path.islink().
is_symlink = False
walk_into = not is_symlink

if walk_into:
walk_dirs.append(entry.path)
if cont:
continue

if topdown:
# Yield before sub-directory traversal if going top down
yield top, dirs, nondirs
# Traverse into sub-directories
for dirname in reversed(dirs):
new_path = join(top, dirname)
# bpo-23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the "yield"
# above.
if followlinks or not islink(new_path):
stack.append((False, new_path))
else:
# Yield after sub-directory traversal if going bottom up
stack.append((True, (top, dirs, nondirs)))
# Traverse into sub-directories
for new_path in reversed(walk_dirs):
stack.append((False, new_path))

__all__.append("walk")

Expand Down
16 changes: 10 additions & 6 deletions Lib/test/support/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2178,19 +2178,23 @@ def check_disallow_instantiation(testcase, tp, *args, **kwds):
testcase.assertRaisesRegex(TypeError, msg, tp, *args, **kwds)

@contextlib.contextmanager
def set_recursion_limit(limit):
"""Temporarily change the recursion limit."""
original_limit = sys.getrecursionlimit()
try:
sys.setrecursionlimit(limit)
yield
finally:
sys.setrecursionlimit(original_limit)

def infinite_recursion(max_depth=75):
"""Set a lower limit for tests that interact with infinite recursions
(e.g test_ast.ASTHelpers_Test.test_recursion_direct) since on some
debug windows builds, due to not enough functions being inlined the
stack size might not handle the default recursion limit (1000). See
bpo-11105 for details."""
return set_recursion_limit(max_depth)

original_depth = sys.getrecursionlimit()
try:
sys.setrecursionlimit(max_depth)
yield
finally:
sys.setrecursionlimit(original_depth)

def ignore_deprecations_from(module: str, *, like: str) -> object:
token = object()
Expand Down
43 changes: 43 additions & 0 deletions Lib/test/test_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from test.support import import_helper
from test.support import os_helper
from test.support import socket_helper
from test.support import set_recursion_limit
from test.support import warnings_helper
from platform import win32_is_iot

Expand Down Expand Up @@ -1471,6 +1472,46 @@ def test_walk_many_open_files(self):
self.assertEqual(next(it), expected)
p = os.path.join(p, 'd')

def test_walk_above_recursion_limit(self):
depth = 50
os.makedirs(os.path.join(self.walk_path, *(['d'] * depth)))
with set_recursion_limit(depth - 5):
all = list(self.walk(self.walk_path))

sub2_path = self.sub2_tree[0]
for root, dirs, files in all:
if root == sub2_path:
dirs.sort()
files.sort()

d_entries = []
d_path = self.walk_path
for _ in range(depth):
d_path = os.path.join(d_path, "d")
d_entries.append((d_path, ["d"], []))
d_entries[-1][1].clear()

# Sub-sequences where the order is known
sections = {
"SUB1": [
(self.sub1_path, ["SUB11"], ["tmp2"]),
(self.sub11_path, [], []),
],
"SUB2": [self.sub2_tree],
"d": d_entries,
}

# The ordering of sub-dirs is arbitrary but determines the order in
# which sub-sequences appear
dirs = all[0][1]
expected = [(self.walk_path, dirs, ["tmp1"])]
for d in dirs:
expected.extend(sections[d])

self.assertEqual(len(all), depth + 4)
self.assertEqual(sorted(dirs), ["SUB1", "SUB2", "d"])
self.assertEqual(all, expected)


@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
class FwalkTests(WalkTests):
Expand Down Expand Up @@ -1545,6 +1586,8 @@ def test_fd_leak(self):

# fwalk() keeps file descriptors open
test_walk_many_open_files = None
# fwalk() still uses recursion
test_walk_above_recursion_limit = None


class BytesWalkTests(WalkTests):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix issue with :func:`os.walk` where a :exc:`RecursionError` would occur on
deep directory structures by adjusting the implementation of
:func:`os.walk` to be iterative instead of recursive.

0 comments on commit 797edb2

Please sign in to comment.