Skip to content

Commit

Permalink
GH-117586: Speed up pathlib.Path.glob() by working with strings (#1…
Browse files Browse the repository at this point in the history
…17589)

Move pathlib globbing implementation into a new private class: `glob._Globber`. This class implements fast string-based globbing. It's called by `pathlib.Path.glob()`, which then converts strings back to path objects.

In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that works with `PathBase` objects rather than strings, and calls user-defined path methods like `PathBase.stat()` rather than `os.stat()`.

This sets the stage for two more improvements:

- GH-115060: Query non-wildcard segments with `lstat()`
- GH-116380: Unify `pathlib` and `glob` implementations of globbing.

No change to the implementations of `glob.glob()` and `glob.iglob()`.
  • Loading branch information
barneygale authored Apr 10, 2024
1 parent 689ada7 commit 6258844
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 195 deletions.
186 changes: 186 additions & 0 deletions Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import os
import re
import fnmatch
import functools
import itertools
import operator
import stat
import sys

Expand Down Expand Up @@ -256,7 +258,9 @@ def escape(pathname):
return drive + pathname


_special_parts = ('', '.', '..')
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
_no_recurse_symlinks = object()


def translate(pat, *, recursive=False, include_hidden=False, seps=None):
Expand Down Expand Up @@ -312,3 +316,185 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
results.append(any_sep)
res = ''.join(results)
return fr'(?s:{res})\Z'


@functools.lru_cache(maxsize=512)
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
    """Return a matcher callable for the glob pattern *pat*.

    The pattern is converted to a regular expression by translate(), with
    *sep* passed as the separator(s) and hidden files always included. The
    expression is compiled with re.IGNORECASE unless *case_sensitive* is
    true, and the bound ``match`` method of the compiled pattern is
    returned. Results are memoised by the surrounding lru_cache.
    """
    if case_sensitive:
        flags = re.NOFLAG
    else:
        flags = re.IGNORECASE
    regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep)
    compiled = re.compile(regex, flags=flags)
    return compiled.match


class _Globber:
    """Shell-style pattern matching and globbing.

    A pattern is handed to selector() as a list of segments that is
    consumed from its end. Each segment becomes a "select" function taking
    ``(path, exists=False)``; the functions are chained so that every
    segment feeds its results to the one built from the remaining segments.
    """

    def __init__(self, sep, case_sensitive, recursive=False):
        self.sep = sep
        self.case_sensitive = case_sensitive
        self.recursive = recursive

    # Low-level methods

    # These hooks are looked up through ``self`` by the selectors below.
    lstat = staticmethod(os.lstat)
    scandir = staticmethod(os.scandir)
    parse_entry = operator.attrgetter('path')
    concat_path = operator.add

    if os.name == 'nt':
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing backslash, unless its tail
            (as split by os.path.splitroot()) is empty or already ends with
            a slash or backslash.
            """
            tail = os.path.splitroot(pathname)[2]
            if tail and tail[-1] not in '\\/':
                return f'{pathname}\\'
            return pathname
    else:
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing slash, unless it is empty
            or already ends with one.
            """
            if pathname and pathname[-1] != '/':
                return f'{pathname}/'
            return pathname

    # High-level methods

    def compile(self, pat):
        """Return a matcher for *pat* built with this globber's settings."""
        return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive)

    def selector(self, parts):
        """Build the select function for the pattern segments in *parts*.

        The last item of *parts* is popped and dispatched to the matching
        selector factory; an empty list yields select_exists.
        """
        if not parts:
            return self.select_exists
        part = parts.pop()
        if part == '**' and self.recursive:
            factory = self.recursive_selector
        elif part in _special_parts:
            factory = self.special_selector
        else:
            factory = self.wildcard_selector
        return factory(part, parts)

    def special_selector(self, part, parts):
        """Build a select function that appends the special segment *part*
        to the given path and passes the result on.
        """
        select_next = self.selector(parts)

        def select_special(path, exists=False):
            joined = self.concat_path(self.add_slash(path), part)
            return select_next(joined, exists)
        return select_special

    def wildcard_selector(self, part, parts):
        """Build a select function that scans the given path and yields the
        direct children whose names match *part*.
        """
        # A bare '*' accepts every name, so no matcher is needed.
        match = self.compile(part) if part != '*' else None
        # When more segments follow, only directories can lead to results.
        dir_only = bool(parts)
        select_next = self.selector(parts) if dir_only else None

        def select_wildcard(path, exists=False):
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with self.scandir(path) as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                return
            for entry in entries:
                if match is not None and not match(entry.name):
                    continue
                if not dir_only:
                    yield self.parse_entry(entry)
                    continue
                try:
                    if not entry.is_dir():
                        continue
                except OSError:
                    continue
                yield from select_next(self.parse_entry(entry), exists=True)
        return select_wildcard

    def recursive_selector(self, part, parts):
        """Build a select function that yields the given path and everything
        beneath it, filtered by pattern.
        """
        # Optimization: consume following '**' parts, which have no effect.
        while parts and parts[-1] == '**':
            parts.pop()

        # Optimization: consume and join any following non-special parts here,
        # rather than leaving them for the next selector. They're used to
        # build a regular expression, which we use to filter the results of
        # the recursive walk. As a result, non-special pattern segments
        # following a '**' wildcard don't require additional filesystem access
        # to expand.
        follow_symlinks = self.recursive is not _no_recurse_symlinks
        if follow_symlinks:
            while parts and parts[-1] not in _special_parts:
                part = part + self.sep + parts.pop()

        # A bare '**' accepts every path, so no matcher is needed.
        match = self.compile(part) if part != '**' else None
        dir_only = bool(parts)
        select_next = self.selector(parts)

        def select_recursive(path, exists=False):
            path = self.add_slash(path)
            path_str = str(path)
            # The matcher is applied from this offset, i.e. to the portion
            # of each candidate that follows the starting path.
            match_pos = len(path_str)
            if match is None or match(path_str, match_pos):
                yield from select_next(path, exists)

            pending = [path]
            while pending:
                current = pending.pop()
                try:
                    # We must close the scandir() object before proceeding to
                    # avoid exhausting file descriptors when globbing deep
                    # trees.
                    with self.scandir(current) as scandir_it:
                        entries = list(scandir_it)
                except OSError:
                    continue
                for entry in entries:
                    try:
                        is_dir = bool(
                            entry.is_dir(follow_symlinks=follow_symlinks))
                    except OSError:
                        is_dir = False
                    if dir_only and not is_dir:
                        continue

                    entry_path = self.parse_entry(entry)
                    if match is None or match(str(entry_path), match_pos):
                        if dir_only:
                            yield from select_next(entry_path, exists=True)
                        else:
                            # Optimization: directly yield the path if this
                            # is last pattern part.
                            yield entry_path
                    if is_dir:
                        pending.append(entry_path)

        return select_recursive

    def select_exists(self, path, exists=False):
        """Yield *path* if it exists, and nothing otherwise."""
        if not exists:
            # Existence is only probed when the caller cannot vouch for it;
            # paths that came from scandir() skip the lstat() call.
            try:
                self.lstat(path)
            except OSError:
                return
        yield path
77 changes: 47 additions & 30 deletions Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
operating systems.
"""

import glob
import io
import ntpath
import operator
import os
import posixpath
import sys
Expand Down Expand Up @@ -111,6 +113,7 @@ class PurePath(_abc.PurePathBase):
'_hash',
)
parser = os.path
_globber = glob._Globber

def __new__(cls, *args, **kwargs):
"""Construct a PurePath from one or several strings and or existing
Expand Down Expand Up @@ -253,14 +256,17 @@ def _format_parsed_parts(cls, drv, root, tail):
return cls.parser.sep.join(tail)

def _from_parsed_parts(self, drv, root, tail):
path_str = self._format_parsed_parts(drv, root, tail)
path = self.with_segments(path_str)
path._str = path_str or '.'
path = self._from_parsed_string(self._format_parsed_parts(drv, root, tail))
path._drv = drv
path._root = root
path._tail_cached = tail
return path

def _from_parsed_string(self, path_str):
path = self.with_segments(path_str)
path._str = path_str or '.'
return path

@classmethod
def _parse_path(cls, path):
if not path:
Expand Down Expand Up @@ -453,21 +459,6 @@ def as_uri(self):
from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path))

@property
def _pattern_stack(self):
"""Stack of path components, to be used with patterns in glob()."""
parts = self._tail.copy()
pattern = self._raw_path
if self.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
elif pattern[-1] in (self.parser.sep, self.parser.altsep):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
parts.append('')
parts.reverse()
return parts

@property
def _pattern_str(self):
"""The path expressed as a string, for use in pattern-matching."""
Expand Down Expand Up @@ -576,6 +567,17 @@ def write_text(self, data, encoding=None, errors=None, newline=None):
encoding = io.text_encoding(encoding)
return _abc.PathBase.write_text(self, data, encoding, errors, newline)

_remove_leading_dot = operator.itemgetter(slice(2, None))
_remove_trailing_slash = operator.itemgetter(slice(-1))

def _filter_trailing_slash(self, paths):
sep = self.parser.sep
anchor_len = len(self.anchor)
for path_str in paths:
if len(path_str) > anchor_len and path_str[-1] == sep:
path_str = path_str[:-1]
yield path_str

def iterdir(self):
"""Yield path objects of the directory contents.
Expand All @@ -587,13 +589,9 @@ def iterdir(self):
def _scandir(self):
return os.scandir(self)

def _direntry_str(self, entry):
# Transform an entry yielded from _scandir() into a path string.
return entry.name if str(self) == '.' else entry.path

def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
path_str = self._direntry_str(entry)
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
path._str = path_str
path._drv = self.drive
Expand Down Expand Up @@ -626,8 +624,30 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
sys.audit("pathlib.Path.glob", self, pattern)
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
if pattern.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
parts = pattern._tail.copy()
if not parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
raw = pattern._raw_path
if raw[-1] in (self.parser.sep, self.parser.altsep):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
parts.append('')
if not self.is_dir():
return iter([])
select = self._glob_selector(parts[::-1], case_sensitive, recurse_symlinks)
root = str(self)
paths = select(root, exists=True)

# Normalize results
if root == '.':
paths = map(self._remove_leading_dot, paths)
if parts[-1] == '':
paths = map(self._remove_trailing_slash, paths)
elif parts[-1] == '**':
paths = self._filter_trailing_slash(paths)
paths = map(self._from_parsed_string, paths)
return paths

def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
"""Recursively yield all existing files (of any kind, including
Expand All @@ -638,8 +658,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
pattern = '**' / pattern
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)

def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk()."""
Expand Down Expand Up @@ -669,9 +688,7 @@ def absolute(self):
# of joining, and we exploit the fact that getcwd() returns a
# fully-normalized string by storing it in _str. This is used to
# implement Path.cwd().
result = self.with_segments(cwd)
result._str = cwd
return result
return self._from_parsed_string(cwd)
drive, root, rel = os.path.splitroot(cwd)
if not rel:
return self._from_parsed_parts(drive, root, self._tail)
Expand Down
Loading

0 comments on commit 6258844

Please sign in to comment.