From 89c4ee92e7d538d507e4077fc1789d9cfc6ed228 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Thu, 11 Jan 2024 18:01:13 +0100 Subject: [PATCH] Start of a `next-status` command This implementation is the first to emit `CommandResult` type result items, ie. dataclass instances rather than result dicts. It also uses uniform parameter validation, enabling substantially simplified implementation (e.g., of the result renderer). The user-facing appearance remains (largely) the same. The command documentation contains a summary of the key differences. Closes #586 --- datalad_next/__init__.py | 4 + datalad_next/commands/__init__.py | 1 + datalad_next/commands/status.py | 375 ++++++++++++++ datalad_next/commands/tests/test_status.py | 55 ++ datalad_next/iter_collections/gitdiff.py | 2 +- datalad_next/iter_collections/gitstatus.py | 488 +++++++++++++++--- .../tests/test_itergitstatus.py | 349 +++++++++++++ docs/source/api.rst | 1 + docs/source/cmd.rst | 1 + 9 files changed, 1217 insertions(+), 59 deletions(-) create mode 100644 datalad_next/commands/status.py create mode 100644 datalad_next/commands/tests/test_status.py create mode 100644 datalad_next/iter_collections/tests/test_itergitstatus.py diff --git a/datalad_next/__init__.py b/datalad_next/__init__.py index 116df896c..86b879839 100644 --- a/datalad_next/__init__.py +++ b/datalad_next/__init__.py @@ -43,6 +43,10 @@ 'datalad_next.commands.ls_file_collection', 'LsFileCollection', 'ls-file-collection', ), + ( + 'datalad_next.commands.status', 'Status', + 'next-status', 'next_status', + ), ] ) diff --git a/datalad_next/commands/__init__.py b/datalad_next/commands/__init__.py index 36de84564..c7cfe99b5 100644 --- a/datalad_next/commands/__init__.py +++ b/datalad_next/commands/__init__.py @@ -14,6 +14,7 @@ CommandResult CommandResultStatus + status.StatusResult """ from __future__ import annotations diff --git a/datalad_next/commands/status.py b/datalad_next/commands/status.py new file mode 100644 index 
000000000..61af15911 --- /dev/null +++ b/datalad_next/commands/status.py @@ -0,0 +1,375 @@ +""" +""" +from __future__ import annotations + +__docformat__ = 'restructuredtext' + +from dataclasses import dataclass +from enum import Enum +from logging import getLogger +from pathlib import Path +from typing import Generator + +from datalad_next.commands import ( + CommandResult, + CommandResultStatus, + EnsureCommandParameterization, + ValidatedInterface, + Parameter, + ParameterConstraintContext, + build_doc, + datasetmethod, + eval_results, +) +from datalad_next.constraints import ( + EnsureChoice, + WithDescription, +) +from datalad_next.constraints.dataset import EnsureDataset + +from datalad_next.iter_collections.gitdiff import ( + GitDiffStatus, + GitTreeItemType, + GitContainerModificationType, +) +from datalad_next.iter_collections.gitstatus import ( + iter_gitstatus, +) +from datalad_next.uis import ( + ui_switcher as ui, + ansi_colors as ac, +) + +lgr = getLogger('datalad.core.local.status') + + +# TODO Could be `StrEnum`, came with PY3.11 +class StatusState(Enum): + """Enumeration of possible states of a status command result + + The "state" is the condition of the dataset item being reported + on. 
+ """ + clean = 'clean' + added = 'added' + modified = 'modified' + deleted = 'deleted' + untracked = 'untracked' + unknown = 'unknown' + + +STATE_COLOR_MAP = { + StatusState.added: ac.GREEN, + StatusState.modified: ac.RED, + StatusState.deleted: ac.RED, + StatusState.untracked: ac.RED, + StatusState.unknown: ac.YELLOW, +} + + +diffstatus2resultstate_map = { + GitDiffStatus.addition: StatusState.added, + GitDiffStatus.copy: StatusState.added, + GitDiffStatus.deletion: StatusState.deleted, + GitDiffStatus.modification: StatusState.modified, + GitDiffStatus.rename: StatusState.added, + GitDiffStatus.typechange: StatusState.modified, + GitDiffStatus.unmerged: StatusState.unknown, + GitDiffStatus.unknown: StatusState.unknown, + GitDiffStatus.other: StatusState.untracked, +} + + +# see base class decorator comment for why this is commented out +#@dataclass(kw_only=True) +@dataclass +class StatusResult(CommandResult): + # TODO any of the following property are not actually optional + # we only have to declare them such for limitations of dataclasses + # prior PY3.10 (see kw_only command in base class + + diff_state: GitDiffStatus | None = None + """The ``status`` of the underlying ``GitDiffItem``. It is named + "_state" to emphasize the conceptual similarity with the legacy + property 'state' + """ + gittype: GitTreeItemType | None = None + """The ``gittype`` of the underlying ``GitDiffItem``.""" + prev_gittype: GitTreeItemType | None = None + """The ``prev_gittype`` of the underlying ``GitDiffItem``.""" + modification_types: tuple[GitContainerModificationType] | None = None + """Qualifiers for modification types of container-type + items (directories, submodules).""" + + @property + def state(self) -> StatusState: + """A (more or less legacy) simplified representation of the subject + state. For a more accurate classification use the ``diff_status`` + property. 
+ """ + return diffstatus2resultstate_map[self.diff_state] + + # the previous status-implementation did not report plain git-types + # we establish a getter to perform this kind of inference/mangling, + # when it is needed + @property + def type(self) -> str | None: + """ + """ + # TODO this is just a placeholder + return self.gittype.value if self.gittype else None + + # we need a setter for this `type`-override stunt + @type.setter + def type(self, value): + self.gittype = value + + @property + def prev_type(self) -> str: + """ + """ + return self.prev_gittype.value if self.prev_gittype else None + + @property + def type_src(self) -> str | None: + """Backward-compatibility adaptor""" + return self.prev_type + + +opt_untracked_values = ('no', 'whole-dir', 'no-empty-dir', 'normal', 'all') +opt_recursive_values = ('no', 'repository', 'datasets', 'mono') +opt_eval_subdataset_state_values = ('no', 'commit', 'full') + + +class StatusParamValidator(EnsureCommandParameterization): + def __init__(self): + super().__init__( + param_constraints=dict( + # if given, it must also exist + dataset=EnsureDataset(installed=True), + untracked=EnsureChoice(*opt_untracked_values), + recursive=EnsureChoice(*opt_recursive_values), + eval_subdataset_state=EnsureChoice( + *opt_eval_subdataset_state_values) + ), + validate_defaults=('dataset',), + joint_constraints={ + ParameterConstraintContext(('untracked', 'recursive'), + 'option normalization'): + self.normalize_options, + }, + ) + + def normalize_options(self, **kwargs): + if kwargs['untracked'] == 'no': + kwargs['untracked'] = None + if kwargs['untracked'] == 'normal': + kwargs['untracked'] = 'no-empty-dir' + if kwargs['recursive'] == 'datasets': + kwargs['recursive'] = 'submodules' + if kwargs['recursive'] == 'mono': + kwargs['recursive'] = 'monolithic' + return kwargs + + +@build_doc +class Status(ValidatedInterface): + """Report on the (modification) status of a dataset + + .. 
note:: + + This is a preview of a command implementation aiming to replace + the DataLad ``status`` command. + + For now, expect anything here to change again. + + This command provides a report that is roughly identical to that of + ``git status``. Running with default parameters yields a report that + should look familiar to Git and DataLad users alike, and contain + the same information as offered by ``git status``. + + The main differences to ``git status`` are: + + - Support for recursion into submodules. ``git status`` does that too, + but the report is limited to the global state of an entire submodule, + whereas this command can issue detailed reports on changes inside + a submodule (any nesting depth). + + - Support for directory-constrained reporting. Much like ``git status`` + limits its report to a single repository, this command can optionally + limit its report to a single directory and its direct children. In this + report subdirectories are considered containers (much like submodules), + and a change summary is provided for them. + + - Support for a "mono" (monolithic repository) report. Unlike a standard + recursion into submodules, and checking each of them for changes with + respect to the HEAD commit of the worktree, this report compares a + submodule with respect to the state recorded in its parent repository. + This provides an equally comprehensive status report from the point of + view of a queried repository, but does not include a dedicated item on + the global state of a submodule. This makes a nested hierarchy of + repositories appear like a single (mono) repository. + + - Support for "adjusted mode" git-annex repositories. These utilize a + managed branch that is repeatedly rewritten, hence is not suitable + for tracking within a parent repository. Instead, the underlying + "corresponding branch" is used, which contains the equivalent content + in an un-adjusted form, persistently.
This command detects this condition + and automatically check a repositories state against the corresponding + branch state. + + *Presently missing/planned features* + + - There is no support for specifying paths (or pathspecs) for constraining + the operation to specific dataset parts. This will be added in the + future. + + - There is no reporting of git-annex properties, such as tracked file size. + It is undetermined whether this will be added in the future. However, + even without a dedicated switch, this command has support for + datasets (and their submodules) in git-annex's "adjusted mode". + + *Differences to the ``status`` command implementation prior DataLad v2* + + - Like ``git status`` this implementation reports on dataset modification, + whereas the previous ``status`` also provided a listing of unchanged + dataset content. This is no longer done. Equivalent functionality for + listing dataset content is provided by the ``ls_file_collection`` + command. + - The implementation is substantially faster. Depending on the context + the speed-up is typically somewhere between 2x and 100x. + - The implementation does not suffer from the limitation re type change + detection. + - Python and CLI API of the command use uniform parameter validation. + """ + # Interface.validate_args() will inspect this dict for the presence of a + # validator for particular parameters + _validator_ = StatusParamValidator() + + # this is largely here for documentation and CLI parser building + _params_ = dict( + dataset=Parameter( + args=("-d", "--dataset"), + doc="""Dataset to be used as a configuration source. Beyond + reading configuration items, this command does not interact with + the dataset."""), + untracked=Parameter( + args=('--untracked',), + choices=opt_untracked_values, + doc="""Determine how untracked content is considered and reported + when comparing a revision to the state of the working tree. 
+ 'no': no untracked content is considered as a change; + 'normal': untracked files and entire untracked directories are + reported as such; + 'all': report individual files even in fully untracked directories. + In addition to these git-status modes, + 'whole-dir' (like normal, but include empty directories), and + 'no-empty-dir' (alias for 'normal') are understood."""), + recursive=Parameter( + args=('-r', '--recursive'), + nargs='?', + const='datasets', + choices=opt_recursive_values, + doc="""Mode of recursion for status reporting. + With 'no' the report is restricted to a single directory and + its direct children. + With 'repository', the report comprises all repository content + underneath current working directory or root of a given dataset, + but is limited to items directly contained in that repository. + With 'datasets', the report also comprises any content in any + subdatasets. Each subdataset is evaluated against its respective + HEAD commit. + With 'mono', a report similar to 'datasets' is generated, but + any subdataset is evaluate with respect to the state recorded + in its parent repository. In constrast to the 'datasets' mode, + no report items on a joint submodule are generated. + [CMD: If no particular value is given with this option the + 'datasets' mode is selected. CMD] + """), + eval_subdataset_state=Parameter( + args=("-e", "--eval-subdataset-state",), + choices=opt_eval_subdataset_state_values, + doc="""Evaluation of subdataset state (modified or untracked + content) can be expensive for deep dataset hierarchies + as subdataset have to be tested recursively for + uncommitted modifications. Setting this option to + 'no' or 'commit' can substantially boost performance + by limiting what is being tested. + With 'no' no state is evaluated and subdataset are not + investigated for modifications. + With 'commit' only a discrepancy of the HEAD commit + gitsha of a subdataset and the gitsha recorded in the + superdataset's record is evaluated. 
+ With 'full' any other modifications are considered + too."""), + ) + + _examples_ = [ + ] + + @staticmethod + @datasetmethod(name="next_status") + @eval_results + def __call__( + # TODO later + #path=None, + *, + dataset=None, + # TODO possibly later + #annex=None, + untracked='normal', + recursive='repository', + eval_subdataset_state='full', + ) -> Generator[StatusResult, None, None] | list[StatusResult]: + ds = dataset.ds + rootpath = Path.cwd() if dataset.original is None else ds.pathobj + + for item in iter_gitstatus( + path=rootpath, + untracked=untracked, + recursive=recursive, + eval_submodule_state=eval_subdataset_state, + ): + yield StatusResult( + action='status', + status=CommandResultStatus.ok, + path=rootpath / (item.path or item.prev_path), + gittype=item.gittype, + prev_gittype=item.prev_gittype, + diff_state=item.status, + modification_types=item.modification_types, + refds=ds, + logger=lgr, + ) + + def custom_result_renderer(res, **kwargs): + # we are guaranteed to have dataset-arg info through uniform + # parameter validation + dsarg = kwargs['dataset'] + rootpath = Path.cwd() if dsarg.original is None else dsarg.ds.pathobj + # because we can always determine the root path of the command + # execution environment, we can report meaningful relative paths + # unconditionally + path = res.path.relative_to(rootpath) + # collapse item type information across current and previous states + type_ = res.type or res.prev_type or '' + max_len = len('untracked') + state = res.state.value + # message format is same as for previous command implementation + ui.message(u'{fill}{state}: {path}{type_}{annot}'.format( + fill=' ' * max(0, max_len - len(state)), + state=ac.color_word( + res.state.value, + STATE_COLOR_MAP.get(res.state)), + path=path, + type_=' ({})'.format(ac.color_word(type_, ac.MAGENTA)) + if type_ else '', + annot=f' [{", ".join(q.value for q in res.modification_types)}]' + if res.modification_types else '', + )) + + @staticmethod + def 
custom_result_summary_renderer(results): + # no reports, no changes + if len(results) == 0: + ui.message("nothing to save, working tree clean") diff --git a/datalad_next/commands/tests/test_status.py b/datalad_next/commands/tests/test_status.py new file mode 100644 index 000000000..55b91d244 --- /dev/null +++ b/datalad_next/commands/tests/test_status.py @@ -0,0 +1,55 @@ +import pytest + +from datalad.api import next_status + +from datalad_next.constraints.exceptions import ( + CommandParametrizationError, + ParameterConstraintContext, +) +from datalad_next.tests.utils import chpwd + +from ..status import ( + opt_eval_subdataset_state_values, + opt_recursive_values, + opt_untracked_values, +) + + +def test_status_invalid(tmp_path, datalad_cfg): + # we want exhaustive parameter validation (i.e., continue after + # first failure), saves some code here + datalad_cfg.set('datalad.runtime.parameter-violation', + 'raise-at-end', + scope='global') + with chpwd(tmp_path): + with pytest.raises(CommandParametrizationError) as e: + next_status( + untracked='weird', + recursive='upsidedown', + eval_subdataset_state='moonphase', + ) + errors = e.value.errors + assert 'no dataset found' in \ + errors[ParameterConstraintContext(('dataset',))].msg.casefold() + for opt in ('untracked', 'recursive', 'eval_subdataset_state'): + assert 'is not one of' in \ + errors[ParameterConstraintContext((opt,))].msg.casefold() + + +def test_status_renderer_smoke(existing_dataset): + ds = existing_dataset + assert ds.next_status() == [] + (ds.pathobj / 'untracked').touch() + st = ds.next_status() + assert len(st) == 1 + + +def test_status_clean(existing_dataset, no_result_rendering): + ds = existing_dataset + ds.create('subds') + for recmode in opt_recursive_values: + assert [] == ds.next_status(recursive=recmode) + for untracked in opt_untracked_values: + assert [] == ds.next_status(untracked=untracked) + for eval_sm in opt_eval_subdataset_state_values: + assert [] == 
ds.next_status(eval_subdataset_state=eval_sm) diff --git a/datalad_next/iter_collections/gitdiff.py b/datalad_next/iter_collections/gitdiff.py index 7bf71744b..346c53b3c 100644 --- a/datalad_next/iter_collections/gitdiff.py +++ b/datalad_next/iter_collections/gitdiff.py @@ -113,7 +113,7 @@ def iter_gitdiff( find_copies: int | None = None, yield_tree_items: str | None = None, # TODO add documentation - eval_submodule_state: str = 'commit', + eval_submodule_state: str = 'full', ) -> Generator[GitDiffItem, None, None]: """Report differences between Git tree-ishes or tracked worktree content diff --git a/datalad_next/iter_collections/gitstatus.py b/datalad_next/iter_collections/gitstatus.py index 66d6f78f6..e75627888 100644 --- a/datalad_next/iter_collections/gitstatus.py +++ b/datalad_next/iter_collections/gitstatus.py @@ -7,17 +7,32 @@ import logging from pathlib import ( Path, - PurePosixPath, + PurePath, ) from typing import Generator +from datalad_next.runners import ( + CommandError, + iter_git_subproc, +) +from datalad_next.itertools import ( + decode_bytes, + itemize, +) +from datalad_next.repo_utils import ( + get_worktree_head, + iter_submodules, +) + from .gitdiff import ( GitDiffItem, GitDiffStatus, - GitTreeItemType, + GitContainerModificationType, iter_gitdiff, ) from .gitworktree import ( + GitTreeItem, + GitTreeItemType, iter_gitworktree, lsfiles_untracked_args, _git_ls_files, @@ -31,9 +46,17 @@ def iter_gitstatus( *, untracked: str | None = 'all', recursive: str = 'repository', - yield_tree_items: str | None = None, + eval_submodule_state: str = "full", ) -> Generator[GitDiffItem, None, None]: """ + Recursion mode 'no' + + This mode limits the reporting to immediate directory items of a given + path. This mode is not necessarily faster than a 'repository' recursion. 
+ Its primary purpose is the ability to deliver a collapsed report in that + subdirectories are treated similarly to submodules -- as containers that + may have modified or untracked content. + Parameters ---------- path: Path @@ -47,20 +70,23 @@ def iter_gitstatus( ``all`` reports on any untracked file; ``whole-dir`` yields a single report for a directory that is entirely untracked, and not individual untracked files in it; ``no-empty-dir`` skips any reports on - untracked empty directories. - recursive: {'repository', 'submodules', 'no'}, optional + untracked empty directories. Also see ``eval_submodule_state`` for + how this parameter is applied in submodule recursion. + recursive: {'no', 'repository', 'submodules', 'monolithic'}, optional Behavior for recursion into subtrees. By default (``repository``), all trees within the repository underneath ``path``) are reported, but no tree within submodules. With ``submodules``, recursion includes any submodule that is present. If ``no``, only direct children are reported on. - yield_tree_items: {'submodules', 'directories', 'all', None}, optional - Whether to yield an item on type of subtree that will also be recursed - into. For example, a submodule item, when submodule recursion is - enabled. When disabled, subtree items (directories, submodules) - will still be reported whenever there is no recursion into them. - For example, submodule items are reported when - ``recursive='repository``, even when ``yield_tree_items=None``. + eval_submodule_state: {"no", "commit", "full"}, optional + If 'full' (default), the state of a submodule is evaluated by + considering all modifications, with the treatment of untracked files + determined by `untracked`. If 'commit', the modification check is + restricted to comparing the submodule's "HEAD" commit to the one + recorded in the superdataset. If 'no', the state of the subdataset is + not evaluated.
When a git-annex repository in adjusted mode is detected, + the reference commit that the worktree is being compared to is the basis + of the adjusted branch (i.e., the corresponding branch). Yields ------ @@ -71,72 +97,274 @@ def iter_gitstatus( """ path = Path(path) - if untracked is None: - # we can delegate all of this - yield from iter_gitdiff( - path, - from_treeish='HEAD', - # to the worktree - to_treeish=None, - recursive=recursive, - yield_tree_items=yield_tree_items, - ) + head, corresponding_head = get_worktree_head(path) + # TODO it would make sense to always (or optionally) compare against any + # existing corresponding_head. This would make the status communicate + # anything that has not made it into the corresponding branch yet + + common_args = dict( + head=head, + path=path, + untracked=untracked, + eval_submodule_state=eval_submodule_state, + ) + + if recursive == 'no': + yield from _yield_dir_items(**common_args) return + elif recursive == 'repository': + yield from _yield_repo_items(**common_args) + # TODO what we really want is a status that is not against a per-repository + # HEAD, but against the commit that is recorded in the parent repository + # TODO we need a name for that + elif recursive in ('submodules', 'monolithic'): + yield from _yield_hierarchy_items( + recursion_mode=recursive, + **common_args, + ) + else: + raise ValueError(f'unknown recursion type {recursive!r}') + - # limit to within-repo, at most - recmode = 'repository' if recursive == 'submodules' else recursive +# +# status generators for each mode +# - # we always start with a repository-contrained diff against the worktree - # tracked content - for item in iter_gitdiff( +def _yield_dir_items( + *, + head: str | None, + path: Path, + untracked: str | None, + eval_submodule_state: str, +): + # potential container items in a directory that need content + # investigation + container_types = ( + GitTreeItemType.directory, + GitTreeItemType.submodule, + ) + if untracked == 
'no': + # no need to look at anything other than the diff report + dir_items = {} + else: + # there is no recursion, avoid wasting cycles on listing individual + # files in subdirectories + untracked = 'whole-dir' if untracked == 'all' else untracked + # gather all dierectory items upfront, we subtract the ones reported + # modified later and lastly yield all untracked content from them + dir_items = { + str(item.name): item + for item in iter_gitworktree( + path, + untracked=untracked, + recursive='no', + ) + } + # diff constrained to direct children + for item in ([] if head is None else iter_gitdiff( path, from_treeish='HEAD', # to the worktree to_treeish=None, - recursive=recmode, - yield_tree_items=yield_tree_items, - ): - # TODO when recursive==submodules, do not yield present - # items of present submodules unless yield_tree_items says so - yield item - - # now untracked files of this repo - assert untracked is not None - yield from _yield_repo_untracked(path, untracked) + recursive='no', + # TODO trim scope like in repo_items + eval_submodule_state=eval_submodule_state, + )): + if item.status != GitDiffStatus.deletion \ + and item.gittype in container_types: + if item.gittype == GitTreeItemType.submodule: + # issue standard submodule container report + _eval_submodule(path, item, eval_submodule_state) + else: + dir_path = path / item.path + # this is on a directory. 
if it appears here, it has + # modified content + if dir_path.exists(): + item.add_modification_type( + GitContainerModificationType.modified_content) + if untracked != 'no' \ + and _path_has_untracked(path / item.path): + item.add_modification_type( + GitContainerModificationType.untracked_content) + else: + # this directory is gone entirely + item.status = GitDiffStatus.deletion + item.modification_types = None + # we dealt with this item completely + dir_items.pop(item.name, None) + if item.status: + yield item - if recursive != 'submodules': - # all other modes of recursion have been dealt with + if untracked == 'no': return - # at this point, we know we need to recurse into submodule, and we still - # have to report on untracked files -> scan the worktree - for item in iter_gitworktree( + # yield anything untracked, and inspect remaining containers + for dir_item in dir_items.values(): + if dir_item.gitsha is None and dir_item.gittype is None: + # this is untracked + yield GitDiffItem( + # for homgeneity for report a str-path no matter what + name=str(dir_item.name), + status=GitDiffStatus.other, + ) + elif dir_item.gittype in container_types: + # none of these containers has any modification other than + # possibly untracked content + item = GitDiffItem( + # for homgeneity for report a str-path no matter what + name=str(dir_item.name), + # this submodule has not been detected as modified + # per-commit, assign reported gitsha to pre and post + # state + gitsha=dir_item.gitsha, + prev_gitsha=dir_item.gitsha, + gittype=dir_item.gittype, + # TODO others? + ) + if item.gittype == GitTreeItemType.submodule: + # issue standard submodule container report + _eval_submodule(path, item, eval_submodule_state) + else: + # this is on a directory. 
if it appears here, it has + # no modified content + if _path_has_untracked(path / dir_item.path): + item.status = GitDiffStatus.modification + item.add_modification_type( + GitContainerModificationType.untracked_content) + if item.status: + yield item + + +def _yield_repo_items( + *, + head: str | None, + path: Path, + untracked: str | None, + eval_submodule_state: str, +) -> Generator[GitDiffItem, None, None]: + """Report status items for a single/whole repsoitory""" + present_submodules = { + # stringify name for speedy comparison + # TODO double-check that comparisons are primarily with + # GitDiffItem.name which is str + str(item.name): item for item in iter_submodules(path) + } + # start with a repository-contrained diff against the worktree + for item in ([] if head is None else iter_gitdiff( path, - untracked=None, - link_target=False, - fp=False, - # singledir mode has been ruled out above, - # we need to find all submodules + from_treeish='HEAD', + # to the worktree + to_treeish=None, recursive='repository', + # we should be able to go cheaper with the submodule evaluation here. 
+ # We need to redo some check for adjusted mode, and other cases anyways + eval_submodule_state='commit' + if eval_submodule_state == 'full' else eval_submodule_state, + )): + # immediately investigate any submodules that are already + # reported modified by Git + if item.gittype == GitTreeItemType.submodule: + _eval_submodule(path, item, eval_submodule_state) + # we dealt with this submodule + present_submodules.pop(item.name, None) + if item.status: + yield item + + # we are not generating a recursive report for submodules, hence + # we need to look at ALL submodules for untracked content + # `or {}` for the case where we got no submodules, which happens + # with `eval_submodule_state == 'no'` + for subm_name, subm_item in (present_submodules or {}).items(): + # none of these submodules has any modification other than + # possibly untracked content + item = GitDiffItem( + # for homgeneity for report a str-path no matter what + name=str(subm_item.name), + # this submodule has not been detected as modified + # per-commit, assign reported gitsha to pre and post + # state + gitsha=subm_item.gitsha, + prev_gitsha=subm_item.gitsha, + gittype=subm_item.gittype, + # TODO others? + ) + # TODO possibly trim eval_submodule_state + _eval_submodule(path, item, eval_submodule_state) + if item.status: + yield item + + if untracked == 'no': + return + + # lastly untracked files of this repo + yield from _yield_repo_untracked(path, untracked) + + +def _yield_hierarchy_items( + *, + head: str | None, + path: Path, + untracked: str | None, + recursion_mode: str, + eval_submodule_state: str, +) -> Generator[GitDiffItem, None, None]: + for item in _yield_repo_items( + head=head, + path=path, + untracked=untracked, + # TODO do we need to adjust the eval mode here for the diff recmodes? 
+ eval_submodule_state=eval_submodule_state, ): - if item.gittype != GitTreeItemType.submodule \ - or item.name == PurePosixPath('.'): - # either this is no submodule, or a submodule that was found at - # the root path -- which would indicate that the submodule - # itself it not around, only its record in the parent + # we get to see any submodule item passing through here, and can simply + # call this function again for a subpath + if item.gittype != GitTreeItemType.submodule: + yield item continue - for i in iter_gitstatus( - # the .path of a GitTreeItem is always POSIX - path=path / item.path, + + # submodule recursion + # the .path of a GitTreeItem is always POSIX + sm_path = path / item.path + if recursion_mode == 'submodules': + # in this mode, we run the submodule status against it own + # worktree head + sm_head, _ = get_worktree_head(sm_path) + # because this need not cover all possible changes with respect + # to the parent repository, we yield an item on the submodule + # itself + yield item + elif recursion_mode == 'monolithic': + # in this mode we determine the change of the submodule with + # respect to the recorded state in the parent. This is either + # the current gitsha, or (if git detected a committed + # modification) the previous sha. 
This way, any further report + # on changes a comprehensive from the point of view of the parent + # repository, hence no submodule item is emitted + sm_head = item.gitsha or item.prev_gitsha + + for i in _yield_hierarchy_items( + head=sm_head, + path=sm_path, untracked=untracked, - recursive='submodules', - yield_tree_items=yield_tree_items, + # TODO here we could implement handling for a recursion-depth limit + recursion_mode=recursion_mode, + eval_submodule_state=eval_submodule_state, ): i.name = f'{item.name}/{i.name}' yield i -def _yield_repo_untracked(path, untracked): +# +# Helpers +# + + +def _yield_repo_untracked( + path: Path, + untracked: str, +) -> Generator[GitDiffItem, None, None]: + """Yield items on all untracked content in a repository""" + if untracked is None: + return for uf in _git_ls_files( path, *lsfiles_untracked_args[untracked], @@ -145,3 +373,147 @@ def _yield_repo_untracked(path, untracked): name=uf, status=GitDiffStatus.other, ) + + +def _path_has_untracked(path: Path) -> bool: + """Recursively check for any untracked content (except empty dirs)""" + if not path.exists(): + # cannot possibly have untracked + return False + for ut in _yield_repo_untracked( + path, + 'no-empty-dir', + ): + # fast exit on the first detection + return True + # we need to find all submodules, regardless of mode. + # untracked content can also be in a submodule underneath + # a directory + for subm in iter_submodules(path): + if _path_has_untracked(path / subm.path): + # fast exit on the first detection + return True + # only after we saw everything we can say there is nothing + return False + + +def _get_submod_worktree_head(path: Path) -> tuple[bool, str | None, bool]: + """Returns (submodule exists, SHA | None, adjusted)""" + try: + HEAD, corresponding_head = get_worktree_head(path) + except ValueError: + return False, None, False + + adjusted = corresponding_head is not None + if adjusted: + # this is a git-annex adjusted branch. 
do the comparison against + # its basis. it is not meaningful to track the managed branch in + # a superdataset + HEAD = corresponding_head + with iter_git_subproc( + ['rev-parse', '--path-format=relative', + '--show-toplevel', HEAD], + cwd=path, + ) as r: + res = tuple(decode_bytes(itemize(r, sep=None, keep_ends=False))) + assert len(res) == 2 + if res[0].startswith('..'): + # this is not a report on a submodule at this location + return False, None, adjusted + else: + return True, res[1], adjusted + + +def _eval_submodule(basepath, item, eval_mode) -> None: + """In-place amend GitDiffItem submodule item + + It does nothing with ``eval_mode='no'``. + """ + if eval_mode == 'no': + return + + item_path = basepath / item.path + # get head commit, and whether a submodule is actually present, + # and/or in adjusted mode + subds_present, head_commit, adjusted = _get_submod_worktree_head(item_path) + if not subds_present: + return + + if adjusted: + _eval_submodule_adjusted(item_path, item, head_commit, eval_mode) + else: + _eval_submodule_normal(item_path, item, head_commit, eval_mode) + + +def _eval_submodule_normal(item_path, item, head_commit, eval_mode) -> None: + if eval_mode == 'full' and item.status is None or ( + item.modification_types + and GitContainerModificationType.new_commits in item.modification_types + ): + # if new commits have been detected, the diff-implementation is + # not able to report "modified content" at the same time, if it + # exists. This requires a dedicated inspection, which conincidentally + # is identical to the analysis of an adjusted mode submodule. 
def _eval_submodule_adjusted(item_path, item, head_commit, eval_mode) -> None:
    """Evaluate a submodule without relying on the parent's diff report

    The diff-report for a submodule in adjusted mode cannot be trusted:
    git would compare against the adjusted branch HEAD alone, which is
    almost always invalid, because tracking a commit of an adjusted branch
    is not meaningful (it goes away).

    Hence ``item`` is reset to "no modification" and then amended in-place
    in three steps:

    - ``head_commit`` differing from the commit recorded in the parent
      repository (``item.prev_gitsha``) is reported as "new commits"
    - any difference between the worktree and ``head_commit`` is reported
      as "modified content"
    - any untracked content is reported as "untracked content"

    With ``eval_mode='commit'`` only the first step is performed.
    """
    def _mark_modified(modification_type) -> None:
        # every finding turns the item into a modification and
        # contributes one qualifier
        item.status = GitDiffStatus.modification
        item.add_modification_type(modification_type)

    # start from a clean slate
    item.status = None
    item.modification_types = None

    if item.prev_gitsha != head_commit:
        _mark_modified(GitContainerModificationType.new_commits)

    if eval_mode == 'commit':
        return

    # compare the submodule worktree against its (corresponding) HEAD
    worktree_changes = iter_gitdiff(
        item_path,
        from_treeish=head_commit,
        # worktree
        to_treeish=None,
        recursive='repository',
        find_renames=None,
        find_copies=None,
        eval_submodule_state='commit',
    )
    if any(change.status is not None for change in worktree_changes):
        _mark_modified(GitContainerModificationType.modified_content)

    # lastly, look for untracked content (recursively)
    if _path_has_untracked(item_path):
        _mark_modified(GitContainerModificationType.untracked_content)
to include in what will be committed) + dir_m/dir_u/file_u + dir_m/file_u + dir_u/file_u + file_u + + Suffix indicates the ought-to state (multiple possible): + + a - added + c - clean + d - deleted + n - new commits + m - modified + u - untracked content + + Prefix indicated the item type: + + file - file + sm - submodule + dir - directory + """ + ds = Dataset(tmp_path_factory.mktemp("status_playground")) + ds.create(result_renderer='disabled') + ds_dir = ds.pathobj / 'dir_m' + ds_dir.mkdir() + ds_dir_d = ds.pathobj / 'dir_d' + ds_dir_d.mkdir() + (ds_dir / 'file_m').touch() + (ds.pathobj / 'file_m').touch() + dirsm = ds.pathobj / 'dir_sm' + dss = {} + for smname in ( + 'sm_d', 'sm_c', 'sm_n', 'sm_m', 'sm_nm', 'sm_u', 'sm_mu', 'sm_nmu', + 'droppedsm_c', + ): + sds = Dataset(dirsm / smname).create(result_renderer='disabled') + # for the plain modification, commit the reference right here + if smname in ('sm_m', 'sm_nm', 'sm_mu', 'sm_nmu'): + (sds.pathobj / 'file_m').touch() + sds.save(to_git=True, result_renderer='disabled') + dss[smname] = sds + # files in superdataset to be deleted + for d in (ds_dir_d, ds_dir, ds.pathobj): + (d / 'file_d').touch() + dss['.'] = ds + dss['dir'] = ds_dir + ds.save(to_git=True, result_renderer='disabled') + ds.drop(dirsm / 'droppedsm_c', what='datasets', reckless='availability', + result_renderer='disabled') + # a new commit + for smname in ('.', 'sm_n', 'sm_nm', 'sm_nmu'): + sds = dss[smname] + (sds.pathobj / 'file_c').touch() + sds.save(to_git=True, result_renderer='disabled') + # modified file + for smname in ('.', 'dir', 'sm_m', 'sm_nm', 'sm_mu', 'sm_nmu'): + obj = dss[smname] + pobj = obj.pathobj if isinstance(obj, Dataset) else obj + (pobj / 'file_m').write_text('modify!') + # untracked + for smname in ('.', 'dir', 'sm_u', 'sm_mu', 'sm_nmu'): + obj = dss[smname] + pobj = obj.pathobj if isinstance(obj, Dataset) else obj + (pobj / 'file_u').touch() + (pobj / 'dirempty_u').mkdir() + (pobj / 'dir_u').mkdir() + (pobj / 'dir_u' / 
def test_status_homogeneity(status_playground):
    """Check invariants that must hold for any parameterization

    Besides verifying these invariants, a main purpose of this test is to
    exercise all (main) code paths.
    """
    ds = status_playground
    root = ds.pathobj
    recursion_modes = ('no', 'repository', 'submodules')

    def _for_each_recmode(**fixed):
        # one parameter set per recursion mode, other parameters fixed
        return [dict(fixed, recursive=mode) for mode in recursion_modes]

    parameterizations = [
        # default
        dict(path=root),
        *_for_each_recmode(path=root),
        # same as above, but with the submodules in the root
        *_for_each_recmode(path=root / 'dir_sm'),
        # no submodule state
        *_for_each_recmode(path=root, eval_submodule_state='no'),
        # just the commit
        *_for_each_recmode(path=root, eval_submodule_state='commit'),
        # without untracked
        *_for_each_recmode(path=root, untracked='no'),
        # special untracked modes
        *_for_each_recmode(path=root, untracked='whole-dir'),
        *_for_each_recmode(path=root, untracked='no-empty-dir'),
        # call in the mountpoint of a dropped submodule
        dict(path=root / 'dir_sm' / 'droppedsm_c'),
    ]

    for kwargs in parameterizations:
        st = {item.name: item for item in iter_gitstatus(**kwargs)}
        # we get no report on anything clean (implicitly also tests
        # whether all item names are plain strings)
        assert not any(i.name.endswith('_c') for i in st.values())

        # we would not see a submodule modification qualifier when
        # instructed not to (fully) evaluate a submodule
        limited_submod_eval = \
            kwargs.get('eval_submodule_state') in ('no', 'commit')

        for i in st.values():
            # the part after the first underscore of an item name
            # encodes its ought-to state(s)
            state_codes = i.path.name.split('_')[1]

            # anything untracked is labeled as such
            if 'u' in state_codes:
                assert (
                    i.status == GitDiffStatus.other
                    or limited_submod_eval
                    or GitContainerModificationType.untracked_content
                    in i.modification_types
                )

            # anything modified is labeled as such
            if 'm' in state_codes:
                assert i.status == GitDiffStatus.modification

            # anything deleted is labeled as such
            if 'd' in state_codes:
                assert i.status == GitDiffStatus.deletion
'dir_m/file_d', 'status': GitDiffStatus.deletion}, + {'name': 'file_m', 'status': GitDiffStatus.modification}, + {'name': 'dir_m/file_m', 'status': GitDiffStatus.modification}, + {'name': 'dir_sm/sm_d', 'status': GitDiffStatus.deletion}, + {'name': 'dir_sm/sm_n', 'status': GitDiffStatus.modification, + 'qual': (GitContainerModificationType.new_commits,)}, + {'name': 'dir_sm/sm_m', 'status': GitDiffStatus.modification, + 'qual': (GitContainerModificationType.modified_content,)}, + {'name': 'dir_sm/sm_nm', 'status': GitDiffStatus.modification, + 'qual': (GitContainerModificationType.modified_content, + GitContainerModificationType.new_commits)}, + {'name': 'dir_sm/sm_nmu', 'status': GitDiffStatus.modification, + 'qual': (GitContainerModificationType.modified_content, + GitContainerModificationType.untracked_content, + GitContainerModificationType.new_commits)}, + {'name': 'dir_sm/sm_u', 'status': GitDiffStatus.modification, + 'qual': (GitContainerModificationType.untracked_content,)}, + {'name': 'dir_sm/sm_mu', 'status': GitDiffStatus.modification, + 'qual': (GitContainerModificationType.modified_content, + GitContainerModificationType.untracked_content)}, +] + +test_cases_submodule_recursion = [ + {'name': 'dir_sm/sm_m/file_a', 'status': GitDiffStatus.addition}, + {'name': 'dir_sm/sm_nm/file_a', 'status': GitDiffStatus.addition}, + {'name': 'dir_sm/sm_mu/file_a', 'status': GitDiffStatus.addition}, + {'name': 'dir_sm/sm_nmu/file_a', 'status': GitDiffStatus.addition}, + {'name': 'dir_sm/sm_m/file_m', 'status': GitDiffStatus.modification}, + {'name': 'dir_sm/sm_mu/file_m', 'status': GitDiffStatus.modification}, + {'name': 'dir_sm/sm_nmu/file_m', 'status': GitDiffStatus.modification}, + {'name': 'dir_sm/sm_u/file_u', 'status': GitDiffStatus.other}, + {'name': 'dir_sm/sm_mu/file_u', 'status': GitDiffStatus.other}, + {'name': 'dir_sm/sm_nmu/file_u', 'status': GitDiffStatus.other}, + {'name': 'dir_sm/sm_u/dir_u/file_u', 'status': GitDiffStatus.other}, + {'name': 
def _assert_testcases(st, tc):
    """Assert that status items match a collection of expected cases

    Parameters
    ----------
    st:
      Mapping of item name to a reported item, which must have ``status``
      and ``modification_types`` attributes.
    tc:
      Iterable of dicts with the keys ``'name'`` and ``'status'``, and
      optionally ``'qual'`` (the expected modification types; compared as
      a set, so order does not matter). Without ``'qual'`` the reported
      item must have no modification types (``None``).
    """
    for case in tc:
        reported = st[case['name']]
        assert reported.status == case['status']

        qualifiers = reported.modification_types
        if 'qual' not in case:
            assert qualifiers is None
            continue
        assert set(qualifiers) == set(case['qual'])
def test_status_monorec(status_playground):
    """Check the status report for ``recursive='monolithic'``

    The report is compared against the combined expectations for
    'repository' and 'submodules' recursion, excluding the expectations on
    the submodule items themselves.
    """
    st = {
        item.name: item
        for item in iter_gitstatus(
            path=status_playground.pathobj, recursive='monolithic',
            eval_submodule_state='full', untracked='all',
        )
    }
    # in this mode we expect the results of a 'repository' mode recursion
    # plus the additional ones from within the submodules, but NOT the
    # submodule-type items themselves
    _assert_testcases(
        st,
        # repository and recursive test cases, minus any direct submodule
        # items (last path component is prefixed with 'sm_')
        [c for c in chain(test_cases_repository_recursion,
                          test_cases_submodule_recursion)
         if c['name'].split('/')[-1].split('_')[0] != 'sm'])