Skip to content

Commit

Permalink
pythonGH-73435: Implement recursive wildcards in `pathlib.PurePath.ma…
Browse files Browse the repository at this point in the history
…tch()` (python#101398)

`PurePath.match()` now handles the `**` wildcard as in `Path.glob()`, i.e. it matches any number of path segments.

We now compile a single `re.Pattern` object for the entire pattern. This is complicated by the fact that `fnmatch` does not treat directory separators as special when evaluating wildcards (`*`, `?`, etc.). To work around this, we arrange the path parts onto separate *lines* in a string and ensure we don't set `re.DOTALL`, so that wildcards cannot match across a separator.

Co-authored-by: Hugo van Kemenade <[email protected]>
Co-authored-by: Alex Waygood <[email protected]>
  • Loading branch information
3 people authored May 30, 2023
1 parent 4c77061 commit 49f90ba
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 15 deletions.
11 changes: 11 additions & 0 deletions Doc/library/pathlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
>>> PurePath('a/b.py').match('/*.py')
False

The *pattern* may be another path object; this speeds up matching the same
pattern against multiple files::

>>> pattern = PurePath('*.py')
>>> PurePath('a/b.py').match(pattern)
True

As with other methods, case-sensitivity follows platform defaults::

>>> PurePosixPath('b.py').match('*.PY')
Expand All @@ -581,6 +588,10 @@ Pure paths provide the following methods and properties:
.. versionadded:: 3.12
The *case_sensitive* argument.

.. versionchanged:: 3.13
Support for the recursive wildcard "``**``" was added. In previous
versions, it acted like the non-recursive wildcard "``*``".


.. method:: PurePath.relative_to(other, walk_up=False)

Expand Down
3 changes: 3 additions & 0 deletions Doc/whatsnew/3.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ Improved Modules
pathlib
-------

* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
(Contributed by Barney Gale in :gh:`73435`.)

* Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and
:meth:`~pathlib.Path.rglob`.
(Contributed by Barney Gale in :gh:`77609`.)
Expand Down
99 changes: 85 additions & 14 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,30 @@ def _ignore_error(exception):
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)


@functools.cache
def _is_case_sensitive(flavour):
return flavour.normcase('Aa') == 'Aa'

#
# Globbing helpers
#


# fnmatch.translate() returns a regular expression that includes a prefix and
# a suffix, which enable matching newlines and ensure the end of the string is
# matched, respectively. These features are undesirable for our implementation
# of PurePatch.match(), which represents path separators as newlines and joins
# pattern segments together. As a workaround, we define a slice object that
# can remove the prefix and suffix from any translate() result. See the
# _compile_pattern_lines() function for more details.
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
_SWAP_SEP_AND_NEWLINE = {
'/': str.maketrans({'/': '\n', '\n': '/'}),
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
}


@functools.lru_cache()
def _make_selector(pattern_parts, flavour, case_sensitive):
pat = pattern_parts[0]
Expand Down Expand Up @@ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive):
return re.compile(fnmatch.translate(pat), flags).match


@functools.lru_cache()
def _compile_pattern_lines(pattern_lines, case_sensitive):
"""Compile the given pattern lines to an `re.Pattern` object.
The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
its path separators and newlines swapped (e.g. '**\n*.py`). By using
newlines to separate path components, and not setting `re.DOTALL`, we
ensure that the `*` wildcard cannot match path separators.
The returned `re.Pattern` object may have its `match()` method called to
match a complete pattern, or `search()` to match from the right. The
argument supplied to these methods must also have its path separators and
newlines swapped.
"""

# Match the start of the path, or just after a path separator
parts = ['^']
for part in pattern_lines.splitlines(keepends=True):
if part == '**\n':
# '**/' component: we use '[\s\S]' rather than '.' so that path
# separators (i.e. newlines) are matched. The trailing '^' ensures
# we terminate after a path separator (i.e. on a new line).
part = r'[\s\S]*^'
elif part == '**':
# '**' component.
part = r'[\s\S]*'
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
# Any other component: pass to fnmatch.translate(). We slice off
# the common prefix and suffix added by translate() to ensure that
# re.DOTALL is not set, and the end of the string not matched,
# respectively. With DOTALL not set, '*' wildcards will not match
# path separators, because the '.' characters in the pattern will
# not match newlines.
part = fnmatch.translate(part)[_FNMATCH_SLICE]
parts.append(part)
# Match the end of the path, always.
parts.append(r'\Z')
flags = re.MULTILINE
if not case_sensitive:
flags |= re.IGNORECASE
return re.compile(''.join(parts), flags=flags)


class _Selector:
"""A selector matches a specific glob pattern part against the children
of a given path."""
Expand Down Expand Up @@ -276,6 +338,10 @@ class PurePath:
# to implement comparison methods like `__lt__()`.
'_parts_normcase_cached',

# The `_lines_cached` slot stores the string path with path separators
# and newlines swapped. This is used to implement `match()`.
'_lines_cached',

# The `_hash` slot stores the hash of the case-normalized string
# path. It's set when `__hash__()` is called for the first time.
'_hash',
Expand Down Expand Up @@ -441,6 +507,16 @@ def _parts_normcase(self):
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
return self._parts_normcase_cached

@property
def _lines(self):
    """The string path with path separators and newlines swapped.

    Used for pattern matching. The translated string is computed on first
    access and stored in ``_lines_cached`` for later calls.
    """
    try:
        swapped = self._lines_cached
    except AttributeError:
        # Not computed yet: pick the translation table for this flavour's
        # separator and apply it to the string form of the path.
        table = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
        swapped = self._lines_cached = str(self).translate(table)
    return swapped

def __eq__(self, other):
if not isinstance(other, PurePath):
return NotImplemented
Expand Down Expand Up @@ -697,23 +773,18 @@ def match(self, path_pattern, *, case_sensitive=None):
"""
Return True if this path matches the given pattern.
"""
if not isinstance(path_pattern, PurePath):
path_pattern = self.with_segments(path_pattern)
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self._flavour)
pat = self.with_segments(path_pattern)
if not pat.parts:
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
if path_pattern.drive or path_pattern.root:
return pattern.match(self._lines) is not None
elif path_pattern._tail:
return pattern.search(self._lines) is not None
else:
raise ValueError("empty pattern")
pat_parts = pat.parts
parts = self.parts
if pat.drive or pat.root:
if len(pat_parts) != len(parts):
return False
elif len(pat_parts) > len(parts):
return False
for part, pat in zip(reversed(parts), reversed(pat_parts)):
match = _compile_pattern(pat, case_sensitive)
if not match(part):
return False
return True


# Subclassing os.PathLike makes isinstance() checks slower,
# which in turn makes Path construction slower. Register instead!
Expand Down
24 changes: 23 additions & 1 deletion Lib/test/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,30 @@ def test_match_common(self):
self.assertFalse(P('/ab.py').match('/a/*.py'))
self.assertFalse(P('/a/b/c.py').match('/a/*.py'))
# Multi-part glob-style pattern.
self.assertFalse(P('/a/b/c.py').match('/**/*.py'))
self.assertTrue(P('a').match('**'))
self.assertTrue(P('c.py').match('**'))
self.assertTrue(P('a/b/c.py').match('**'))
self.assertTrue(P('/a/b/c.py').match('**'))
self.assertTrue(P('/a/b/c.py').match('/**'))
self.assertTrue(P('/a/b/c.py').match('**/'))
self.assertTrue(P('/a/b/c.py').match('/a/**'))
self.assertTrue(P('/a/b/c.py').match('**/*.py'))
self.assertTrue(P('/a/b/c.py').match('/**/*.py'))
self.assertTrue(P('/a/b/c.py').match('/a/**/*.py'))
self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py'))
self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py'))
self.assertFalse(P('c.py').match('**/a.py'))
self.assertFalse(P('c.py').match('c/**'))
self.assertFalse(P('a/b/c.py').match('**/a'))
self.assertFalse(P('a/b/c.py').match('**/a/b'))
self.assertFalse(P('a/b/c.py').match('**/a/b/c'))
self.assertFalse(P('a/b/c.py').match('**/a/b/c.'))
self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**'))
self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py'))
self.assertRaises(ValueError, P('a').match, '**a/b/c')
self.assertRaises(ValueError, P('a').match, 'a/b/c**')
# Case-sensitive flag
self.assertFalse(P('A.py').match('a.PY', case_sensitive=True))
self.assertTrue(P('A.py').match('a.PY', case_sensitive=False))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.

0 comments on commit 49f90ba

Please sign in to comment.