@dataclass(frozen=True)
class GitPathSpec:
    """Support class for patterns used to limit paths in Git commands

    From the Git documentation:

      Pathspecs are used on the command line of "git ls-files", "git ls-tree",
      "git add", "git grep", "git diff", "git checkout", and many other
      commands to limit the scope of operations to some subset of the tree
      or working tree.

    Apart from providing a dedicated type for a pathspec, the main purpose
    of this functionality is to take a pathspec that is valid in the context
    of one (top-level) repository, and translate it such that the set of
    pathspecs given to the same command running on/in a submodule/subdirectory
    gives the same results, as if the initial top-level invocation reported
    them (if it even could). See the ``for_subdir()`` method for more.

    >>> # simple stripping of leading directory
    >>> ps = GitPathSpec.from_pathspec_str('dir/*.jpg')
    >>> [str(i) for i in ps.for_subdir('dir')]
    ['*.jpg']
    >>> # match against magic pathspecs
    >>> ps = GitPathSpec.from_pathspec_str(':(glob)**r/*.jpg')
    >>> # longest and shortest match are produced
    >>> [str(i) for i in ps.for_subdir('dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> [str(i) for i in ps.for_subdir('root/some/dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> # support for special 'no-pathspec' pathspec
    >>> ps = GitPathSpec.from_pathspec_str(':')
    >>> ps.is_nopathspecs
    True
    >>> # short-form magic, including the optional terminating colon
    >>> str(GitPathSpec.from_pathspec_str(':/:*.jpg'))
    ':(top)*.jpg'

    .. seealso::

      - Entry in the Git glossary:
        https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec
      - Informative, more elaborate description of pathspecs:
        https://css-tricks.com/git-pathspecs-and-how-to-use-them/
    """
    # TODO think about adding support for another magic that represents
    # the root of a repository hierarchy (amending 'top', which is
    # the root of the working tree -- but presumably for a single repository)
    spectypes: tuple[str, ...]
    """Long-form pathspec type identifiers"""
    dirprefix: str | None
    """Directory prefix (pathspec up to the last slash) limiting the scope

    This is ``None`` when the parsed pathspec contains no slash at all.
    """
    pattern: str | None
    """Pattern to match paths against using ``fnmatch``"""

    @property
    def is_nopathspecs(self) -> bool:
        """Whether this pathspec is the "no pathspecs" pathspec, AKA ``':'``"""
        return not self.spectypes and not self.dirprefix and not self.pattern

    def __str__(self) -> str:
        """Generate normalized (long-form) pathspec"""
        if self.is_nopathspecs:
            return ':'
        magic = f':({",".join(self.spectypes)})' if self.spectypes else ''
        return f'{magic}{self._get_joined_pattern()}'

    def _get_joined_pattern(self) -> str:
        """Return ``dirprefix`` and ``pattern`` joined by a slash

        The slash is only emitted when there is a (non-empty) ``dirprefix``.
        A missing ``pattern`` contributes nothing, such that a pathspec
        like ``'dir/'`` keeps its trailing slash.
        """
        prefix = f'{self.dirprefix}/' if self.dirprefix else ''
        return f'{prefix}{self.pattern or ""}'

    def for_subdir(self, subdir: str) -> list[GitPathSpec]:
        """Translate a pathspec into the scope of a subdirectory.

        The processing implemented here is purely lexical. This means that it
        works without matching against actual file system (or Git tree)
        content. Consequently, to some degree, overly broad results are
        produced, but at the same time use cases are supported where there
        is nothing (yet) to match against (e.g., a not-yet-cloned submodule).

        A pathspec with a ``top`` magic is produced unmodified, as these are
        defined relative to the root of a repository, not relative to a base
        directory. As a consequence, such pathspecs will automatically
        refer to a submodule root when the target directory is contained in
        one.

        Parameters
        ----------
        subdir: str
          Relative path in POSIX notation

        Returns
        -------
        list
          When an empty list is returned, this indicates that the pathspec
          cannot be translated to the given ``subdir``, because it does
          not match the ``subdir`` itself. If a pathspec translates to
          "no pathspecs" (``':'``), a list with a dedicated ':' pathspec is
          returned.
        """
        # special case of a non-translation (pretty much only here to
        # make some test implementations simpler)
        if not subdir:
            return [self]

        return list(yield_subdir_match_remainder_pathspecs(subdir, self))

    @classmethod
    def from_pathspec_str(
        cls,
        pathspec: str,
    ) -> GitPathSpec:
        """Parse a string-form pathspec into types, prefix, and pattern

        Supported are the special "no pathspecs" pathspec (``':'``),
        long-form magic (``':(magic1,magic2)pattern'``), short-form magic
        (``':/pattern'``, ``':!pattern'``, ``':^pattern'``, optionally
        terminated by another colon, e.g. ``':/:pattern'``), and plain
        patterns.

        Parameters
        ----------
        pathspec: str
          Pathspec in the notation accepted by Git commands

        Returns
        -------
        GitPathSpec

        Raises
        ------
        ValueError
          If a long-form magic is not terminated by a closing parenthesis,
          or if ``glob`` and ``literal`` magic are declared simultaneously.
        """
        if pathspec == ':':
            # shortcut for the special no-path-spec pathspec
            return cls((), '', None)

        spectypes: list[str] = []
        if pathspec.startswith(':('):
            # long-form magic
            magic, closing, pattern = pathspec[2:].partition(')')
            if not closing:
                raise ValueError(
                    f'unterminated long-form magic in pathspec {pathspec!r}')
            spectypes = magic.split(',')
        elif pathspec.startswith(':'):
            # short-form magic: zero or more signature characters ...
            magic_signatures = {
                '/': 'top',
                '!': 'exclude',
                '^': 'exclude',
            }
            idx = 1
            while idx < len(pathspec) and pathspec[idx] in magic_signatures:
                spectypes.append(magic_signatures[pathspec[idx]])
                idx += 1
            # ... optionally terminated by another colon, which is not
            # part of the pattern
            if pathspec[idx:idx + 1] == ':':
                idx += 1
            pattern = pathspec[idx:]
        else:
            pattern = pathspec

        # raise when glob and literal magic markers are present
        # simultaneously
        if 'glob' in spectypes and 'literal' in spectypes:
            raise ValueError(
                "'glob' magic is incompatible with 'literal' magic")

        # split off dirprefix
        dirprefix, pattern = cls._split_prefix_pattern(pattern)

        return cls(
            spectypes=tuple(spectypes),
            dirprefix=dirprefix,
            pattern=pattern,
        )

    @staticmethod
    def _split_prefix_pattern(pathspec: str) -> tuple[str | None, str | None]:
        """Split a magic-free pathspec into ``(dirprefix, pattern)``

        Without any slash, everything is the pattern and the prefix is
        ``None``. With nothing following the last slash, the pattern is
        ``None``.
        """
        # > the pathspec up to the last slash represents a directory prefix.
        # > The scope of that pathspec is limited to that subtree.
        dirprefix, slash, pattern = pathspec.rpartition('/')
        if not slash:
            # everything is the pattern
            return None, pathspec
        return dirprefix, pattern or None


def yield_subdir_match_remainder_pathspecs(
    subdir: str,
    pathspec: GitPathSpec,
) -> Generator[GitPathSpec, None, None]:
    """Translate a pathspec into a set of possible subdirectory pathspecs

    The processing implemented here is purely lexical. This means that it
    works without matching against actual file system (or Git tree) content.
    This means that it yields, to some degree, overly broad results, but also
    that it works in cases where there is nothing (yet) to match against.
    For example, a not-yet-cloned submodule.

    This function does not perform any validity checking of pathspecs. Only
    valid pathspecs and well-formed paths are supported.

    A pathspec with the ``top`` magic is returned immediately and as-is. These
    pathspecs have an absolute reference and do not require a translation into
    a subdirectory namespace.

    Parameters
    ----------
    subdir: str
      POSIX-notation relative path of a subdirectory. The reference directory
      must be the same as that of the pathspec to be translated.
    pathspec: GitPathSpec
      To-be-translated pathspec

    Yields
    ------
    GitPathSpec
      Any number of pathspecs that an input pathspec decomposed into upon
      translation into the namespace of a subdirectory.
    """
    if 'top' in pathspec.spectypes or pathspec.is_nopathspecs:
        # pathspec with an absolute reference, or "no pathspecs"
        # no translation needed
        yield pathspec
        return

    no_pathspecs = GitPathSpec.from_pathspec_str(':')

    # add a trailing directory separator to prevent undesired
    # matches of partial directory names
    subdir = subdir if subdir.endswith('/') else f'{subdir}/'
    tp = pathspec._get_joined_pattern()

    if 'icase' in pathspec.spectypes:
        subdir = subdir.casefold()
        tp = tp.casefold()

    # literal pathspecs
    if 'literal' in pathspec.spectypes:
        # ensure a (single) trailing slash to allow for full matches, and
        # to only ever compare complete path components
        tp_endslash = tp if tp.endswith('/') else f'{tp}/'
        if tp_endslash.startswith(subdir):
            # the pathspec points to the subdir or into it
            remainder = tp[len(subdir):]
            if remainder:
                yield GitPathSpec(
                    pathspec.spectypes,
                    *GitPathSpec._split_prefix_pattern(remainder)
                )
            else:
                # full match
                yield no_pathspecs
        elif subdir.startswith(tp_endslash):
            # the pathspec names a parent directory of a multi-level
            # subdir, hence everything underneath the subdir matches.
            # Siblings ('abc/xyz' vs. 'abc/123') and partial names
            # ('abcd' vs. 'abc/123') do not end up here.
            yield no_pathspecs
        return

    # tokenize the testpattern using the wildcard that also matches
    # directories
    token_delim = '**' if 'glob' in pathspec.spectypes else '*'
    tp_chunks = tp.split(token_delim)
    prefix_match = ''
    yielded = set()
    for i, chunk in enumerate(tp_chunks):
        last_chunk = i + 1 == len(tp_chunks)
        if last_chunk:
            trymatch = \
                f'{prefix_match}{chunk}{"" if chunk.endswith("/") else "/"}'
        else:
            trymatch = f'{prefix_match}{chunk}*'
        if not fnmatch(subdir, trymatch):
            # each chunk needs match in order, first non-match ends the
            # algorithm
            # BUT
            # we have an (initial) chunk that points already
            # inside the target subdir
            submatch = trymatch
            while submatch := posixpath.split(submatch)[0]:
                if fnmatch(subdir, f'{submatch}/'):
                    ps = GitPathSpec(
                        pathspec.spectypes,
                        *GitPathSpec._split_prefix_pattern(
                            # +1 for trailing slash
                            tp[len(submatch) + 1:])
                    )
                    if ps not in yielded:
                        yield ps
                    return
            # OR
            # we might have a multi-level subdir, and we might match an
            # intermediate subdir and could still yield a 'no pathspec'
            # result
            parent = subdir
            while parent := posixpath.split(parent)[0]:
                if fnmatch(f'{parent}/', trymatch):
                    yield no_pathspecs
                    return
            return

        remainder = tp_chunks[i + 1:]
        if all(not c for c in remainder):
            # direct hit, no pathspecs after translation
            yield no_pathspecs
            return

        ps = GitPathSpec(
            pathspec.spectypes,
            *GitPathSpec._split_prefix_pattern(
                f'{token_delim}{token_delim.join(remainder)}',
            )
        )
        yield ps
        yielded.add(ps)
        # extend prefix for the next round
        prefix_match = trymatch
def _list_files(path, pathspecs):
    """Return what ``git ls-files --other`` reports for ``pathspecs``

    The command is executed with ``path`` as its working directory. Output
    is requested NUL-delimited (``-z``); empty items (such as the one after
    the final delimiter) are dropped.
    """
    return [
        i for i in subprocess.run(
            ['git', 'ls-files', '-z', '--other', '--', *pathspecs],
            capture_output=True,
            cwd=path,
        ).stdout.decode('utf-8').split('\0')
        if i
    ]


@pytest.fixture(scope="function")
def pathspec_match_testground(tmp_path_factory):
    """Create a Git repo with no commit and many untracked files

    In this playground, `git ls-files --other` can be used to test-run
    pathspecs.

    The repository contains the directories ``aba/`` and ``a?a/``, each
    populated with ``a.txt``, ``A.txt``, and ``a.JPG``. The same layout is
    repeated underneath ``sub/``, which additionally contains ``b.dat``.
    When the file system is flagged as "crippled" (see below), ``a?a/`` is
    not created, and the expected listings contain no ``A.txt``.
    See the top item in `testcases` for a summary of the content.

    Yields
    ------
    tuple
      The path of the repository, and a list of test case ``dict``s. Each
      has a ``ps`` key (pathspec string) and, optionally, ``norm`` (expected
      normalized string form, if different from ``ps``), ``raises``
      (exception type expected from parsing ``ps``), and ``fordir``. The
      latter maps a subdirectory (``None`` stands for "no translation") to a
      ``dict`` with the optional keys ``specs`` (expected translated
      pathspecs) and ``match`` (expected ``_list_files()`` result).
    """
    p = tmp_path_factory.mktemp('pathspec_match')
    probe = p / 'pr?be'
    # check for case insensitive file systems: there, the upper-cased
    # variant of the path resolves to the directory that was just created
    crippled_fs = Path(str(p).upper()).exists()
    # also flag file systems that cannot create a file name containing
    # a '?'
    try:
        probe.touch()
        probe.unlink()
    except OSError:
        crippled_fs = True

    subprocess.run(['git', 'init'], cwd=p, check=True)
    p_sub = p / 'sub'
    p_sub.mkdir()
    # same content in the root and in sub/
    for d in (p, p_sub):
        p_a = d / 'aba'
        p_b = d / 'a?a'
        for sp in (p_a,) if crippled_fs else (p_a, p_b):
            sp.mkdir()
            for fname in ('a.txt', 'A.txt', 'a.JPG'):
                (sp / fname).touch()
    # add something that is unique to sub/
    (p_sub / 'b.dat').touch()

    testcases = [
        # valid
        dict(
            ps=':',
            fordir={
                None: {'specs': [':'],
                       'match': [
                           'aba/a.JPG', 'aba/a.txt',
                           'sub/aba/a.JPG', 'sub/aba/a.txt',
                           'sub/b.dat'] if crippled_fs else [
                           'a?a/A.txt', 'a?a/a.JPG', 'a?a/a.txt',
                           'aba/A.txt', 'aba/a.JPG', 'aba/a.txt',
                           'sub/a?a/A.txt', 'sub/a?a/a.JPG', 'sub/a?a/a.txt',
                           'sub/aba/A.txt', 'sub/aba/a.JPG', 'sub/aba/a.txt',
                           'sub/b.dat'],
                       },
                'sub': {'specs': [':'],
                        'match': [
                            'aba/a.JPG', 'aba/a.txt',
                            'b.dat'] if crippled_fs else [
                            'a?a/A.txt', 'a?a/a.JPG', 'a?a/a.txt',
                            'aba/A.txt', 'aba/a.JPG', 'aba/a.txt',
                            'b.dat'],
                        },
            },
        ),
        dict(
            ps='aba',
            fordir={
                None: {'match': [
                    'aba/a.JPG', 'aba/a.txt',
                ] if crippled_fs else [
                    'aba/A.txt', 'aba/a.JPG', 'aba/a.txt'],
                },
                'aba': {'specs': [':'],
                        'match': [
                            'a.JPG', 'a.txt'] if crippled_fs else [
                            'A.txt', 'a.JPG', 'a.txt'],
                        },
            },
        ),
        # same as above, but with a trailing slash
        dict(
            ps='aba/',
            fordir={
                None: {'match': [
                    'aba/a.JPG', 'aba/a.txt',
                ] if crippled_fs else [
                    'aba/A.txt', 'aba/a.JPG', 'aba/a.txt'],
                },
                'aba': {'specs': [':'],
                        'match': [
                            'a.JPG', 'a.txt'] if crippled_fs else [
                            'A.txt', 'a.JPG', 'a.txt'],
                        },
            },
        ),
        # TODO same as above, but as a literal

        dict(
            ps=':(glob)aba/*.txt',
            fordir={
                None: {'match': [
                    'aba/a.txt',
                ] if crippled_fs else ['aba/A.txt', 'aba/a.txt']},
                # no translation into the scope of sub/ is expected
                'sub': {'specs': []},
            },
        ),
        dict(
            ps=':/aba/*.txt',
            norm=':(top)aba/*.txt',
            fordir={
                None: {'match': [
                    'aba/a.txt',
                ] if crippled_fs else ['aba/A.txt', 'aba/a.txt']},
                # for a subdir it keeps matching the exact same items,
                # not only by name, but by location
                'sub': {'specs': [':(top)aba/*.txt'],
                        'match': ['../aba/a.txt'] if crippled_fs else [
                            '../aba/A.txt', '../aba/a.txt']},
            },
        ),
        dict(
            ps='aba/*.txt',
            fordir={
                None: {'match': ['aba/a.txt'] if crippled_fs else [
                    'aba/A.txt', 'aba/a.txt'],
                },
                # not applicable
                'sub': {'specs': []},
                # but this is
                'aba': {'specs': ['*.txt']},
            },
        ),
        dict(
            ps='sub/aba/*.txt',
            fordir={
                None: {'match': ['sub/aba/a.txt'] if crippled_fs else [
                    'sub/aba/A.txt', 'sub/aba/a.txt']},
                'sub': {'specs': ['aba/*.txt'],
                        'match': ['aba/a.txt'] if crippled_fs else [
                            'aba/A.txt', 'aba/a.txt']},
            },
        ),
        dict(
            ps='*.JPG',
            fordir={
                None: {'match': [
                    'aba/a.JPG', 'sub/aba/a.JPG'] if crippled_fs else [
                    'a?a/a.JPG', 'aba/a.JPG', 'sub/a?a/a.JPG',
                    'sub/aba/a.JPG']},
                # unchanged
                'sub': {'specs': ['*.JPG']},
            },
        ),
        dict(
            ps='*ba*.JPG',
            fordir={
                None: {'match': ['aba/a.JPG', 'sub/aba/a.JPG']},
                'aba': {'specs': ['*ba*.JPG', '*.JPG'],
                        'match': ['a.JPG']},
            },
        ),
        # invalid
        #
        # conceptual conflict and thereby unsupported by Git
        # makes sense and is easy to catch that
        dict(ps=':(glob,literal)broken', raises=ValueError),
    ]
    if not crippled_fs:
        testcases.extend([
            # literal magic is only needed for non-crippled FS
            dict(
                ps=':(literal)a?a/a.JPG',
                fordir={
                    None: dict(
                        match=['a?a/a.JPG'],
                    ),
                    "a?a": dict(
                        specs=[':(literal)a.JPG'],
                        match=['a.JPG'],
                    ),
                },
            ),
            dict(
                ps=':(literal,icase)SuB/A?A/a.jpg',
                fordir={
                    None: {'match': ['sub/a?a/a.JPG']},
                    "sub/a?a": {
                        'specs': [':(literal,icase)a.jpg'],
                        # given that the spec transformation matches,
                        # MIH would really expect the following,
                        # but it is not coming from Git :(
                        #'match': ['a.JPG'],
                        'match': [],
                    },
                },
            ),
            dict(
                ps=':(icase)A?A/a.jpg',
                fordir={
                    None: {'match': ['a?a/a.JPG', 'aba/a.JPG']},
                    "aba": {
                        'specs': [':(icase)a.jpg'],
                        'match': ['a.JPG'],
                    },
                },
            ),
            dict(
                ps=':(literal,icase)A?A/a.jpg',
                fordir={
                    None: {'match': ['a?a/a.JPG']},
                    "a?a": {
                        'specs': [':(literal,icase)a.jpg'],
                        'match': ['a.JPG'],
                    },
                    # the target subdir does not match the pathspec
                    "aba": {'specs': set()},
                },
            ),
        ])

    yield p, testcases
def test_pathspecs(pathspec_match_testground):
    """Check parsing, normalization, and subdir translation of pathspecs

    Every test case provided by the fixture is parsed (or verified to fail
    parsing), its string form is compared with the declared normalized
    form, and its translations into the declared subdirectories are
    compared with the expectation -- by their string form and, where
    declared, by what Git itself reports for them in the test repository.
    """
    repo_root, cases = pathspec_match_testground

    for case in cases:
        spec_str = case['ps']
        expected_exc = case.get('raises')
        if expected_exc:
            # the test case declares how `GitPathSpec` is going to blow up,
            # confirm that and be done with this case
            with pytest.raises(expected_exc):
                GitPathSpec.from_pathspec_str(spec_str)
            continue

        ps = GitPathSpec.from_pathspec_str(spec_str)
        # without a declared, deviating normalized form, the string form
        # has to be identical to the input
        assert str(ps) == case.get('norm', spec_str)

        # subdir translations. The subdir `None` stands for "self", i.e.,
        # no translation other than the normalization. It is used to test
        # the matching behavior of the full pathspec
        for subdir, target in case.get('fordir', {}).items():
            # a single input pathspec can turn into multiple translated
            # ones
            translated = [str(s) for s in ps.for_subdir(subdir)]
            declares_match = 'match' in target
            if 'specs' in target:
                assert set(translated) == set(target['specs']), \
                    f'Mismatch for {spec_str!r} -> subdir {subdir!r} {target}'
            if declares_match and subdir and not target.get('specs'):
                raise ValueError(
                    'invalid test specification: no subdir specs expected, '
                    f'but match declared: {case!r}')
            if not (translated and declares_match):
                continue
            workdir = repo_root / subdir if subdir else repo_root
            assert _list_files(workdir, translated) == target['match']


def test_yield_subdir_match_remainder_pathspecs():
    """Check the purely lexical translation against a table of expectations

    Each case is tested twice: with the subdirectory as given, and with a
    trailing slash appended to it.
    """
    testcases = [
        # FORMAT: target path, pathspec, subdir pathspecs
        ('abc', ':', [':']),
        # top-magic is returned as-is
        ('murks', ':(top)crazy*^#', [':(top)crazy*^#']),
        # no match
        ('abc', 'not', []),
        ('abc', 'ABC', [':'] if sys.platform.startswith('win') else []),
        # direct hits, resolve to "no pathspecs"
        ('abc', 'a?c', [':']),
        ('abc', 'abc', [':']),
        ('abc', 'abc/', [':']),
        # icase-magic
        ('abc', ':(icase)ABC', [':']),
        ('ABC', ':(icase)abc', [':']),
        # some fairly common fnmatch-style pathspec
        ('abc', 'abc/*.jpg', ['*.jpg']),
        ('abc', '*.jpg', ['*.jpg']),
        ('abc', '*/*.jpg', ['*/*.jpg', '*.jpg']),
        ('abc', '*/*.jpg', ['*/*.jpg', '*.jpg']),
        ('abc', '*bc*.jpg', ['*bc*.jpg', '*.jpg']),
        # adding a glob-unrelated magic does not impact the result
        ('abc', ':(exclude)*/*.jpg', [':(exclude)*/*.jpg', ':(exclude)*.jpg']),
        ('abc', ':(attr:export-subst)*/*.jpg',
         [':(attr:export-subst)*/*.jpg', ':(attr:export-subst)*.jpg']),
        ('abc', ':(icase,exclude)*/*.jpg',
         [':(icase,exclude)*/*.jpg', ':(icase,exclude)*.jpg']),
        # glob-magic
        ('abc', ':(glob)*bc*.jpg', []),
        ('abc', ':(glob)*bc**.jpg', [':(glob)**.jpg']),
        # 2nd-level subdir
        ('abc/123', 'some.jpg', []),
        ('abc/123', '*.jpg', ['*.jpg']),
        ('abc/123', 'abc/*', [':']),
        ('abc/123', 'abc', [':']),
        ('abc/123', ':(glob)abc', [':']),
        ('abc/123', '*123', ['*123', ':']),
        ('abc/123', '*/123', ['*/123', ':']),
        ('abc/123', ':(glob)*/123', [':']),
        # literal-magic
        ('abc', ':(literal)a?c', []),
        ('a?c', ':(literal)a?c', [':']),
        ('a?c', ':(literal)a?c/*?ab*', [':(literal)*?ab*']),
        ('a?c/123', ':(literal)a?c', [':']),
        # more complex cases
        ('abc/123/ABC', 'a*/1?3/*.jpg',
         ['*/1?3/*.jpg', '*.jpg', '1?3/*.jpg']),
        # exclude-magic
        ('abc', ':(exclude)abc', [':']),
        ('abc/123', ':(exclude)abc', [':']),
        ('a?c', ':(exclude,literal)a?c', [':']),
        # stuff that was problematic at some point
        # initial, non-wildcard part already points inside the
        # target directory
        ('sub', 'sub/aba/*.txt', ['aba/*.txt']),
        # no directory-greedy wildcard whatsoever
        ('abc', ':(icase)A?C/a.jpg', [':(icase)a.jpg']),
        # no directory-greedy wildcard in later chunk
        ('nope/abc', 'no*/a?c/a.jpg', ['*/a?c/a.jpg', 'a.jpg']),
    ]
    for case in testcases:
        subdir, spec, expected = case
        parsed = GitPathSpec.from_pathspec_str(spec)
        # the subdir as given, and once more with a trailing slash
        for target_path in (subdir, f'{subdir}/'):
            translated = [
                str(ps)
                for ps in yield_subdir_match_remainder_pathspecs(
                    target_path,
                    parsed,
                )
            ]
            assert translated == expected, f'Mismatch for {case}'