diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index ee3330f44f47d0..67ef36890d5739 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -569,6 +569,13 @@ Pure paths provide the following methods and properties: >>> PurePath('a/b.py').match('/*.py') False + The *pattern* may be another path object; this speeds up matching the same + pattern against multiple files:: + + >>> pattern = PurePath('*.py') + >>> PurePath('a/b.py').match(pattern) + True + As with other methods, case-sensitivity follows platform defaults:: >>> PurePosixPath('b.py').match('*.PY') @@ -581,6 +588,10 @@ Pure paths provide the following methods and properties: .. versionadded:: 3.12 The *case_sensitive* argument. + .. versionchanged:: 3.13 + Support for the recursive wildcard "``**``" was added. In previous + versions, it acted like the non-recursive wildcard "``*``". + .. method:: PurePath.relative_to(other, walk_up=False) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 8c81ac76a56b46..44c0915492dcc0 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -90,6 +90,9 @@ Improved Modules pathlib ------- +* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. + (Contributed by Barney Gale in :gh:`73435`.) + * Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`. (Contributed by Barney Gale in :gh:`77609`.) 
diff --git a/Lib/pathlib.py b/Lib/pathlib.py index a57b582a211e06..62406473b66e4f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,6 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +@functools.cache def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' @@ -61,6 +62,22 @@ def _is_case_sensitive(flavour): # Globbing helpers # + +# fnmatch.translate() returns a regular expression that includes a prefix and +# a suffix, which enable matching newlines and ensure the end of the string is +# matched, respectively. These features are undesirable for our implementation +# of PurePath.match(), which represents path separators as newlines and joins +# pattern segments together. As a workaround, we define a slice object that +# can remove the prefix and suffix from any translate() result. See the +# _compile_pattern_lines() function for more details. +_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) +_SWAP_SEP_AND_NEWLINE = { + '/': str.maketrans({'/': '\n', '\n': '/'}), + '\\': str.maketrans({'\\': '\n', '\n': '\\'}), +} + + @functools.lru_cache() def _make_selector(pattern_parts, flavour, case_sensitive): pat = pattern_parts[0] @@ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive): return re.compile(fnmatch.translate(pat), flags).match +@functools.lru_cache() +def _compile_pattern_lines(pattern_lines, case_sensitive): + """Compile the given pattern lines to an `re.Pattern` object. + + The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with + its path separators and newlines swapped (e.g. '**\n*.py'). By using + newlines to separate path components, and not setting `re.DOTALL`, we + ensure that the `*` wildcard cannot match path separators. + + The returned `re.Pattern` object may have its `match()` method called to + match a complete pattern, or `search()` to match from the right. 
The + argument supplied to these methods must also have its path separators and + newlines swapped. + """ + + # Match the start of the path, or just after a path separator + parts = ['^'] + for part in pattern_lines.splitlines(keepends=True): + if part == '**\n': + # '**/' component: we use '[\s\S]' rather than '.' so that path + # separators (i.e. newlines) are matched. The trailing '^' ensures + # we terminate after a path separator (i.e. on a new line). + part = r'[\s\S]*^' + elif part == '**': + # '**' component. + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + # Any other component: pass to fnmatch.translate(). We slice off + # the common prefix and suffix added by translate() to ensure that + # re.DOTALL is not set, and the end of the string not matched, + # respectively. With DOTALL not set, '*' wildcards will not match + # path separators, because the '.' characters in the pattern will + # not match newlines. + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + # Match the end of the path, always. + parts.append(r'\Z') + flags = re.MULTILINE + if not case_sensitive: + flags |= re.IGNORECASE + return re.compile(''.join(parts), flags=flags) + + class _Selector: """A selector matches a specific glob pattern part against the children of a given path.""" @@ -276,6 +338,10 @@ class PurePath: # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', + # The `_lines_cached` slot stores the string path with path separators + # and newlines swapped. This is used to implement `match()`. + '_lines_cached', + # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. 
'_hash', @@ -441,6 +507,16 @@ def _parts_normcase(self): self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) return self._parts_normcase_cached + @property + def _lines(self): + # Path with separators and newlines swapped, for pattern matching. + try: + return self._lines_cached + except AttributeError: + trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] + self._lines_cached = str(self).translate(trans) + return self._lines_cached + def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -697,23 +773,18 @@ def match(self, path_pattern, *, case_sensitive=None): """ Return True if this path matches the given pattern. """ + if not isinstance(path_pattern, PurePath): + path_pattern = self.with_segments(path_pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self._flavour) - pat = self.with_segments(path_pattern) - if not pat.parts: + pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive) + if path_pattern.drive or path_pattern.root: + return pattern.match(self._lines) is not None + elif path_pattern._tail: + return pattern.search(self._lines) is not None + else: raise ValueError("empty pattern") - pat_parts = pat.parts - parts = self.parts - if pat.drive or pat.root: - if len(pat_parts) != len(parts): - return False - elif len(pat_parts) > len(parts): - return False - for part, pat in zip(reversed(parts), reversed(pat_parts)): - match = _compile_pattern(pat, case_sensitive) - if not match(part): - return False - return True + # Subclassing os.PathLike makes isinstance() checks slower, # which in turn makes Path construction slower. Register instead! 
diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index 4391d685d3c126..076ace3d930857 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -310,8 +310,30 @@ def test_match_common(self): self.assertFalse(P('/ab.py').match('/a/*.py')) self.assertFalse(P('/a/b/c.py').match('/a/*.py')) # Multi-part glob-style pattern. - self.assertFalse(P('/a/b/c.py').match('/**/*.py')) + self.assertTrue(P('a').match('**')) + self.assertTrue(P('c.py').match('**')) + self.assertTrue(P('a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('/**')) + self.assertTrue(P('/a/b/c.py').match('**/')) + self.assertTrue(P('/a/b/c.py').match('/a/**')) + self.assertTrue(P('/a/b/c.py').match('**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/*.py')) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py')) + self.assertFalse(P('c.py').match('**/a.py')) + self.assertFalse(P('c.py').match('c/**')) + self.assertFalse(P('a/b/c.py').match('**/a')) + self.assertFalse(P('a/b/c.py').match('**/a/b')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c.')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**')) + self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py')) + self.assertRaises(ValueError, P('a').match, '**a/b/c') + self.assertRaises(ValueError, P('a').match, 'a/b/c**') # Case-sensitive flag self.assertFalse(P('A.py').match('a.PY', case_sensitive=True)) self.assertTrue(P('A.py').match('a.PY', case_sensitive=False)) diff --git a/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst new file mode 100644 index 00000000000000..d5a2ae07700b34 --- /dev/null 
+++ b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst @@ -0,0 +1 @@ +Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.