From 0c83a307cdcb8d9d0f2e6dae171c8313cabf8e63 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Wed, 8 Jun 2022 21:41:36 +0200 Subject: [PATCH 001/131] Port test suite to pytest --- .appveyor.yml | 4 +- .github/workflows/test_crippledfs.yml | 2 +- datalad_next/__init__.py | 3 - datalad_next/backend/tests/test_base.py | 2 +- datalad_next/conftest.py | 16 ++++ .../gitremote/tests/test_datalad_annex.py | 14 +-- .../tests/test_create_sibling_ghlike.py | 2 +- datalad_next/patches/tests/test_push.py | 4 +- .../tests/test_push_to_export_remote.py | 2 +- .../tests/test_create_sibling_webdav.py | 91 +++++++------------ datalad_next/tests/test_credentials.py | 2 +- datalad_next/tests/test_credman.py | 4 +- datalad_next/tests/test_utils.py | 6 +- datalad_next/tests/utils.py | 2 +- requirements-devel.txt | 10 +- setup.cfg | 4 +- 16 files changed, 77 insertions(+), 91 deletions(-) create mode 100644 datalad_next/conftest.py diff --git a/.appveyor.yml b/.appveyor.yml index 284beb6e..f9d0a0a8 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -193,8 +193,8 @@ test_script: - sh: mkdir __testhome__ - cd __testhome__ # run test selecion (--traverse-namespace needed from Python 3.8 onwards) - - cmd: python -m nose --traverse-namespace -s -v -A "not (turtle)" --with-cov --cover-package datalad_next %DTS% - - sh: PATH=$PWD/../tools/coverage-bin:$PATH python -m nose --traverse-namespace -s -v -A "not (turtle)" --with-cov --cover-package datalad_next ${DTS} + - cmd: python -m pytest -s -v -m "not (turtle)" --cov=datalad_next --pyargs %DTS% + - sh: PATH=$PWD/../tools/coverage-bin:$PATH python -m pytest -s -v -m "not (turtle)" --cov=datalad_next --pyargs ${DTS} after_test: diff --git a/.github/workflows/test_crippledfs.yml b/.github/workflows/test_crippledfs.yml index 819eec90..dddd22e6 100644 --- a/.github/workflows/test_crippledfs.yml +++ b/.github/workflows/test_crippledfs.yml @@ -53,7 +53,7 @@ jobs: echo "== mount >>" mount echo "<< mount ==" - PATH=$PWD/../tools/coverage-bin:$PATH python -m nose -s -v --with-doctest --with-coverage --cover-package datalad_next datalad_next + PATH=$PWD/../tools/coverage-bin:$PATH python -m pytest -s -v --doctest-modules --cov=datalad_next --pyargs datalad_next - name: Report coverage run: | python -m coverage combine -a /tmp/.coverage-entrypoints-*; diff --git a/datalad_next/__init__.py b/datalad_next/__init__.py index 6a3260c1..4d819e75 100644 --- a/datalad_next/__init__.py +++ b/datalad_next/__init__.py @@ -65,9 +65,6 @@ ) -from datalad import setup_package -from datalad import teardown_package - from ._version import get_versions __version__ = get_versions()['version'] del get_versions diff --git a/datalad_next/backend/tests/test_base.py b/datalad_next/backend/tests/test_base.py index 615a7161..f14cbc40 100644 --- a/datalad_next/backend/tests/test_base.py +++ b/datalad_next/backend/tests/test_base.py @@ -1,7 +1,7 @@ import logging import io -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( assert_raises, eq_, ) diff --git a/datalad_next/conftest.py b/datalad_next/conftest.py new file mode 100644 index 00000000..7c727c32 --- /dev/null +++ b/datalad_next/conftest.py @@ -0,0 +1,16 @@ +try: + from datalad.conftest import setup_package +except ImportError: + # assume old datalad without pytest support introduced in + # https://github.com/datalad/datalad/pull/6273 + import pytest + from datalad import setup_package as _setup_package + from datalad import teardown_package as _teardown_package + + + @pytest.fixture(autouse=True, scope="session") + def 
setup_package(): + _setup_package() + yield + _teardown_package() + diff --git a/datalad_next/gitremote/tests/test_datalad_annex.py b/datalad_next/gitremote/tests/test_datalad_annex.py index 014a0a7b..db50b6ad 100644 --- a/datalad_next/gitremote/tests/test_datalad_annex.py +++ b/datalad_next/gitremote/tests/test_datalad_annex.py @@ -19,7 +19,7 @@ clone, ) from datalad.runner import CommandError -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( DEFAULT_BRANCH, DEFAULT_REMOTE, assert_raises, @@ -64,7 +64,7 @@ def eq_dla_branch_state(state, path, branch=DEFAULT_BRANCH): @skip_if_on_windows @with_tempfile @with_tempfile(mkdir=True) -def test_annex_remote(dspath, remotepath): +def test_annex_remote(dspath=None, remotepath=None): # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none' \ @@ -76,7 +76,7 @@ def test_annex_remote(dspath, remotepath): @with_tempfile @with_tempfile(mkdir=True) -def test_export_remote(dspath, remotepath): +def test_export_remote(dspath=None, remotepath=None): # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none&exporttree=yes' \ @@ -194,7 +194,7 @@ def _check_push_fetch_cycle(ds, remoteurl, remotepath, localtargetpath, probepat @with_tempfile @with_tempfile(mkdir=True) -def test_annex_remote_autorepush(dspath, remotepath): +def test_annex_remote_autorepush(dspath=None, remotepath=None): # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none' \ @@ -206,7 +206,7 @@ def test_annex_remote_autorepush(dspath, remotepath): @with_tempfile @with_tempfile(mkdir=True) -def test_export_remote_autorepush(dspath, remotepath): +def test_export_remote_autorepush(dspath=None, remotepath=None): # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none&exporttree=yes' \ @@ -319,7 +319,7 @@ def _check_typeweb(pushtmpl, clonetmpl, export, url, preppath, clonepath): @with_tempfile(mkdir=True) @serve_path_via_http @with_tempfile -def test_submodule_url(servepath, url, workdir): +def test_submodule_url(servepath=None, url=None, workdir=None): workdir = Path(workdir) # a future subdataset that we want to register under a complex URL tobesubds = Dataset(workdir / 'subdsprep').create(annex=False, result_renderer='disabled') @@ -361,7 +361,7 @@ def test_submodule_url(servepath, url, workdir): @with_tempfile @with_tempfile @serve_path_via_webdav(auth=webdav_cred) -def test_webdav_auth(preppath, clnpath, remotepath, webdavurl): +def test_webdav_auth(preppath=None, clnpath=None, remotepath=None, webdavurl=None): # this is the dataset we want to roundtrip through webdav ds = Dataset(preppath).create(annex=False, result_renderer='disabled') diff --git a/datalad_next/patches/tests/test_create_sibling_ghlike.py b/datalad_next/patches/tests/test_create_sibling_ghlike.py index 7b452595..5da27463 100644 --- a/datalad_next/patches/tests/test_create_sibling_ghlike.py +++ b/datalad_next/patches/tests/test_create_sibling_ghlike.py @@ -17,7 +17,7 @@ # we overwrite this one from core, because it assumed the old credential # system to be used @with_tempfile -def test_invalid_call(path): +def test_invalid_call(path=None): # no dataset assert_raises(ValueError, create_sibling_gin, 'bogus', dataset=path) ds = 
Dataset(path).create() diff --git a/datalad_next/patches/tests/test_push.py b/datalad_next/patches/tests/test_push.py index a737ad49..b3f7ad55 100644 --- a/datalad_next/patches/tests/test_push.py +++ b/datalad_next/patches/tests/test_push.py @@ -1,4 +1,4 @@ -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( DEFAULT_REMOTE, assert_result_count, with_tempfile, @@ -14,7 +14,7 @@ # value, because our implementation behaves "better" @with_tempfile() @with_tempfile() -def test_gh1811(srcpath, clonepath): +def test_gh1811(srcpath=None, clonepath=None): # `annex=false` is the only change from the -core implementation # of the test. For normal datasets with an annex, the problem underlying # gh1811 is no longer valid, because of more comprehensive analysis of diff --git a/datalad_next/patches/tests/test_push_to_export_remote.py b/datalad_next/patches/tests/test_push_to_export_remote.py index b9c98b7c..3b0ef55e 100644 --- a/datalad_next/patches/tests/test_push_to_export_remote.py +++ b/datalad_next/patches/tests/test_push_to_export_remote.py @@ -7,7 +7,7 @@ ) from datalad.runner.exception import CommandError -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( SkipTest, assert_false, assert_in, diff --git a/datalad_next/tests/test_create_sibling_webdav.py b/datalad_next/tests/test_create_sibling_webdav.py index ff7b8994..86296ed4 100644 --- a/datalad_next/tests/test_create_sibling_webdav.py +++ b/datalad_next/tests/test_create_sibling_webdav.py @@ -6,7 +6,7 @@ from urllib.parse import quote as urlquote from datalad.cli.tests.test_main import run_main -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( assert_equal, assert_in, assert_in_results, @@ -16,10 +16,7 @@ eq_, ok_, ) -# TODO find a replacement for this in anticipation of nose->pytest -from nose.tools import ( - assert_raises_regexp, -) +import pytest from datalad.api import ( clone, @@ -28,7 +25,7 @@ from datalad.distribution.dataset import ( Dataset, ) -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( with_tempfile, with_tree ) @@ -155,7 +152,7 @@ def check_common_workflow( @with_tempfile -def test_bad_url_catching(path): +def test_bad_url_catching(path=None): # Ensure that bad URLs are detected and handled ds = Dataset(path).create() @@ -179,17 +176,13 @@ def test_bad_url_catching(path): ] for bad_url, expected_message in check_pairs: - assert_raises_regexp( - ValueError, - expected_message.format(url=bad_url), - create_sibling_webdav, - dataset=ds, - url=bad_url - ) + with pytest.raises(ValueError, + match=expected_message.format(url=bad_url)): + create_sibling_webdav(dataset=ds, url=bad_url) @with_tempfile -def test_http_warning(path): +def test_http_warning(path=None): # Check that usage of http: triggers a warning. ds = Dataset(path).create() @@ -204,12 +197,10 @@ def test_http_warning(path): # We set up the mocks to generate the following exception. This allows # us to limit the test to the logic in 'create_sibling_wabdav'. 
- assert_raises_regexp( - ValueError, - f"No suitable credential for {url} found or specified", - create_sibling_webdav, - dataset=ds, - url=url) + with pytest.raises( + ValueError, + match=f"No suitable credential for {url} found or specified"): + create_sibling_webdav(dataset=ds, url=url) eq_(lgr_mock.warning.call_count, 1) assert_in( @@ -221,22 +212,19 @@ def test_http_warning(path): @with_tempfile -def test_constraints_checking(path): +def test_constraints_checking(path=None): # Ensure that constraints are checked internally ds = Dataset(path).create() url = "http://localhost:22334/abc" for key in ("existing", "mode"): - assert_raises_regexp( - ValueError, "value is not one of", - create_sibling_webdav, - dataset=ds, - url=url, - **{key: "illegal-value"}) + with pytest.raises(ValueError, match="value is not one of"): + create_sibling_webdav( + dataset=ds, url=url, **{key: "illegal-value"}) @with_tempfile -def test_credential_handling(path): +def test_credential_handling(path=None): ds = Dataset(path).create() url = "https://localhost:22334/abc" @@ -247,25 +235,19 @@ def test_credential_handling(path): csw_mock.return_value = iter([]) gur_mock.return_value = None - assert_raises_regexp( - ValueError, - f"No suitable credential for {url} found or specified", - create_sibling_webdav, - dataset=ds, - url=url, - name="some_name", - existing="error") + with pytest.raises( + ValueError, + match=f"No suitable credential for {url} found or specified"): + create_sibling_webdav( + dataset=ds, url=url, name="some_name", existing="error") gur_mock.reset_mock() gur_mock.return_value = [None, {"some_key": "some_value"}] - assert_raises_regexp( - ValueError, - f"No suitable credential for {url} found or specified", - create_sibling_webdav, - dataset=ds, - url=url, - name="some_name", - existing="error") + with pytest.raises( + ValueError, + match=f"No suitable credential for {url} found or specified"): + create_sibling_webdav( + dataset=ds, url=url, name="some_name", existing="error") # Ensure that failed credential storing is handled and logged gur_mock.reset_mock() @@ -278,24 +260,19 @@ def test_credential_handling(path): @with_tempfile -def test_name_clash_detection(path): +def test_name_clash_detection(path=None): # Ensure that constraints are checked internally ds = Dataset(path).create() url = "http://localhost:22334/abc" for mode in ("annex", 'filetree', 'annex-only', 'filetree-only'): - assert_raises_regexp( - ValueError, "sibling names must not be equal", - create_sibling_webdav, - dataset=ds, - url=url, - name="abc", - storage_name="abc", - mode=mode) + with pytest.raises(ValueError, match="sibling names must not be equal"): + create_sibling_webdav( + dataset=ds, url=url, name="abc", storage_name="abc", mode=mode) @with_tempfile -def test_unused_storage_name_warning(path): +def test_unused_storage_name_warning(path=None): # Ensure that constraints are checked internally ds = Dataset(path).create() @@ -378,7 +355,7 @@ def query(self, *args, **kwargs): 'f3': '3'}) @with_tempfile @serve_path_via_webdav(auth=webdav_cred[1:]) -def test_existing_switch(localpath, remotepath, url): +def test_existing_switch(localpath=None, remotepath=None, url=None): ca = dict(result_renderer='disabled') ds = Dataset(localpath).create(force=True, **ca) # use a tricky name: '3f7' will be the hashdir of the XDLRA @@ -525,7 +502,7 @@ def test_existing_switch(localpath, remotepath, url): @with_tempfile @with_tempfile @serve_path_via_webdav(auth=webdav_cred[1:]) -def test_result_renderer(localpath, remotepath, url): 
+def test_result_renderer(localpath=None, remotepath=None, url=None): ca = dict(result_renderer='disabled') ds = Dataset(localpath).create(**ca) # need to amend the test credential, can only do after we know the URL diff --git a/datalad_next/tests/test_credentials.py b/datalad_next/tests/test_credentials.py index 341cba2e..41dd8e66 100644 --- a/datalad_next/tests/test_credentials.py +++ b/datalad_next/tests/test_credentials.py @@ -18,7 +18,7 @@ normalize_specs, ) from datalad.support.keyring_ import MemoryKeyring -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( assert_in, assert_in_results, assert_raises, diff --git a/datalad_next/tests/test_credman.py b/datalad_next/tests/test_credman.py index 1a3d7a6a..d66574fe 100644 --- a/datalad_next/tests/test_credman.py +++ b/datalad_next/tests/test_credman.py @@ -18,7 +18,7 @@ _get_cred_cfg_var, ) from datalad.support.keyring_ import MemoryKeyring -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( assert_in, assert_not_in, assert_raises, @@ -105,7 +105,7 @@ def check_credmanager(): @with_tempfile -def test_credman_local(path): +def test_credman_local(path=None): ds = Dataset(path).create(result_renderer='disabled') credman = CredentialManager(ds.config) diff --git a/datalad_next/tests/test_utils.py b/datalad_next/tests/test_utils.py index 39b6b737..114dac48 100644 --- a/datalad_next/tests/test_utils.py +++ b/datalad_next/tests/test_utils.py @@ -1,7 +1,7 @@ from pathlib import Path from webdav3.client import Client as DAVClient -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( ok_, with_tempfile, ) @@ -13,7 +13,7 @@ @with_tempfile @with_tempfile @serve_path_via_webdav(auth=webdav_cred) -def test_serve_webdav(localpath, remotepath, url): +def test_serve_webdav(localpath=None, remotepath=None, url=None): webdav_cfg = dict( webdav_hostname=url, webdav_login=webdav_cred[0], @@ -30,7 +30,7 @@ def test_serve_webdav(localpath, remotepath, url): @with_tempfile @with_tempfile @serve_path_via_webdav -def test_serve_webdav_noauth(localpath, remotepath, url): +def test_serve_webdav_noauth(localpath=None, remotepath=None, url=None): webdav_cfg = dict( webdav_hostname=url, webdav_root='/', diff --git a/datalad_next/tests/utils.py b/datalad_next/tests/utils.py index ca851b29..20667f40 100644 --- a/datalad_next/tests/utils.py +++ b/datalad_next/tests/utils.py @@ -3,7 +3,7 @@ from pathlib import Path from datalad.utils import optional_args -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( SkipTest, attr, ) diff --git a/requirements-devel.txt b/requirements-devel.txt index 83c86eee..06056893 100644 --- a/requirements-devel.txt +++ b/requirements-devel.txt @@ -1,11 +1,5 @@ # requirements for a development environment -nose -coverage +# (also) to get the docs built properly by RTD +-e .[devel] sphinx sphinx_rtd_theme -# for webdav testing -cheroot -wsgidav -webdavclient3 -# to get the docs built properly by RTD --e . 
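Note on test selection: nose's attribute filter `-A "not (turtle)"` is replaced
above by pytest's marker expression `-m "not (turtle)"`. For this to keep working
cleanly, tests marked with `@pytest.mark.turtle` need the marker to be registered,
otherwise pytest emits "unknown mark" warnings (and errors out under
`--strict-markers`). A minimal sketch of such a registration in a conftest.py --
not part of this changeset, shown only for illustration:

    def pytest_configure(config):
        # declare the marker used by the CI selection expression above;
        # in the datalad test suite 'turtle' denotes very slow tests
        config.addinivalue_line(
            "markers", "turtle: mark a test as very slow")
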
diff --git a/setup.cfg b/setup.cfg
index 40b449cc..ed487aa6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,8 +22,10 @@ include_package_data = True
 
 [options.extras_require]
 # this matches the name used by -core and what is expected by some CI setups
 devel =
-    nose
+    pytest
+    pytest-cov
     coverage
+    # for webdav testing
     cheroot
     wsgidav
     webdavclient3

From fcd846d226510e584a386072c74e574f9d9aaf9c Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Wed, 8 Jun 2022 21:51:20 +0200
Subject: [PATCH 002/131] Temporarily depend on datalad-not-yet-0.17

---
 requirements-devel.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements-devel.txt b/requirements-devel.txt
index 06056893..4e8e4c86 100644
--- a/requirements-devel.txt
+++ b/requirements-devel.txt
@@ -1,5 +1,7 @@
 # requirements for a development environment
 # (also) to get the docs built properly by RTD
 -e .[devel]
+# we need datalad from the future with pytest foo for now
+git+https://github.com/datalad/datalad
 sphinx
 sphinx_rtd_theme

From 501b12457f2b10f20c549270f9de9537832c2d02 Mon Sep 17 00:00:00 2001
From: Caterina Trainito
Date: Wed, 29 Jun 2022 19:54:15 +0200
Subject: [PATCH 003/131] register datalad tree command, dummy implementation

---
 datalad_next/__init__.py            |   6 ++
 datalad_next/tests/test_register.py |   1 +
 datalad_next/tree.py                | 102 ++++++++++++++++++++++++++++
 docs/source/index.rst               |   2 +
 4 files changed, 111 insertions(+)
 create mode 100644 datalad_next/tree.py

diff --git a/datalad_next/__init__.py b/datalad_next/__init__.py
index 4d819e75..39710ed2 100644
--- a/datalad_next/__init__.py
+++ b/datalad_next/__init__.py
@@ -28,6 +28,12 @@
             # not pick it up, due to the dashes in the name
             'create-sibling-webdav',
         ),
+        (
+            # importable module that contains the command implementation
+            'datalad_next.tree',
+            # name of the command class implementation in above module
+            'Tree',
+        )
     ]
 )

diff --git a/datalad_next/tests/test_register.py b/datalad_next/tests/test_register.py
index dad43de1..ad7382f7 100644
--- a/datalad_next/tests/test_register.py
+++ b/datalad_next/tests/test_register.py
@@ -2,3 +2,4 @@
 def test_register():
     import datalad.api as da
     assert hasattr(da, 'credentials')
+    assert hasattr(da, 'tree')

diff --git a/datalad_next/tree.py b/datalad_next/tree.py
new file mode 100644
index 00000000..d1018c62
--- /dev/null
+++ b/datalad_next/tree.py
@@ -0,0 +1,102 @@
+# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
+# ex: set sts=4 ts=4 sw=4 noet:
+# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
+#
+# See LICENSE file distributed along with the datalad_next package for the
+# copyright and license terms.
+# +# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## +"""'tree'-like command for visualization of dataset hierarchies""" + +__docformat__ = 'restructuredtext' + +import json +import logging + +from datalad.interface.base import ( + Interface, + build_doc, +) +from datalad.support.exceptions import CapturedException +from datalad.support.param import Parameter +from datalad.distribution.dataset import ( + datasetmethod, + EnsureDataset, + require_dataset, +) +from datalad.interface.results import ( + get_status_dict, +) +from datalad.interface.utils import ( + eval_results, + generic_result_renderer, +) +from datalad.support.constraints import ( + EnsureChoice, + EnsureNone, + EnsureStr, EnsureInt, +) + +lgr = logging.getLogger('datalad.local.tree') + + +@build_doc +class Tree(Interface): + """Visualize dataset hierarchy trees + + This command mimics the UNIX/MSDOS 'tree' command to display directory + trees, highlighting DataLad datasets in the hierarchy. + + """ + result_renderer = 'tailored' + + _params_ = dict( + dataset=Parameter( + args=("-d", "--dataset"), + doc="""specify a dataset for which to generate the + directory tree. If no dataset is given, will generate the + tree starting from the current directory.""", + constraints=EnsureDataset() | EnsureNone()), + path=Parameter( + args=("path",), + nargs='?', + doc="""path to directory from which to generate the tree. + If empty, will generate the tree starting from the current + directory.""", + constraints=EnsureStr() | EnsureNone()), + level=Parameter( + args=("-L", "--level",), + doc="""maximum depth for dataset/directory tree""", + constraints=EnsureInt() | EnsureNone()), + # TODO: + # --include-files (only lists directories by default) + # --full-paths (equivalent of 'tree -f') + ) + + _examples_ = [ + dict( + text="Display first-level subdirectories of the current directory, " + "with markers highlighting datasets", + code_py="tree('.')", + code_cmd="datalad tree -L 1"), + dict(text="Display the full dataset hierarchy from the current dataset, " + "only showing directories that are datasets", + code_py="tree(dataset='.', full_paths=True)", + code_cmd="datalad tree -d . 
--full-paths"), + ] + + @staticmethod + @datasetmethod(name='tree') + @eval_results + def __call__(path='.', dataset=None, *, level=None): + + ds = require_dataset( + dataset, + check_installed=True, + purpose='display dataset hierarchy tree') + + yield get_status_dict( + action='tree', + status='ok', + ds=ds, + ) \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 6bcebdd4..35602640 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -22,6 +22,7 @@ High-level API commands create_sibling_webdav credentials + tree Command line reference @@ -32,6 +33,7 @@ Command line reference generated/man/datalad-create-sibling-webdav generated/man/datalad-credentials + generated/man/datalad-tree Python utilities From 3cdeb2f32557eb78761090aa5ea79ca832e6db11 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 3 Jul 2022 17:59:28 +0200 Subject: [PATCH 004/131] bulk of implementation with 2 passing tests --- datalad_next/tests/test_tree.py | 91 +++++++++ datalad_next/tree.py | 320 ++++++++++++++++++++++++++++---- 2 files changed, 376 insertions(+), 35 deletions(-) create mode 100644 datalad_next/tests/test_tree.py diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py new file mode 100644 index 00000000..2cf4e4da --- /dev/null +++ b/datalad_next/tests/test_tree.py @@ -0,0 +1,91 @@ +import os + +from datalad.tests.utils_pytest import ( + assert_in, + assert_not_in, + assert_raises, + eq_, + neq_, + patch_config, + with_tree, +) +from ..tree import Tree, Walk + +""" +Tests for datalad tree. +TODO: create fixture for precomputing tree output for a given set +of parameters, so it can be reused in multiple tests. +""" + +# directory layout to be tested that will be created as temp dir. +# directory index is equals to the count of its subdirectories +# and the (count-1) of files contained underneath it. 
+# TODO: generate programmatically instead of hardcoding +# (though it's easier to visualize if hardcoded) +_temp_dir_tree = { + "root": { + "dir0": {}, # empty dir + "dir1": { + "dir1_file0": 'tempfile', + }, + "dir2": { + "dir2_dir0": {}, + "dir2_dir1": { + "dir2_dir1_file0": 'tempfile', + }, + "dir2_dir2": { + "dir2_dir2_file0": 'tempfile', + "dir2_dir2_file1": 'tempfile', + }, + "dir2_file0": 'tempfile', + "dir2_file1":'tempfile' + }, + "file0": 'tempfile', + "file1": 'tempfile', + } +} + + +@with_tree(_temp_dir_tree) +def test_build_tree_dirs_only(path=None): + root = os.path.join(path, 'root') + walk = Walk(root, max_depth=3, include_files=False) + walk.build_tree() + actual = walk.get_tree() + + expected = f"""{root} +├── dir0 +├── dir1 +└── dir2 + ├── dir2_dir0 + ├── dir2_dir1 + └── dir2_dir2 +""" + eq_(expected, actual) + + +@with_tree(_temp_dir_tree) +def test_build_tree_including_files(path=None): + root = os.path.join(path, 'root') + walk = Walk(root, max_depth=3, include_files=True) + walk.build_tree() + actual = walk.get_tree() + + expected = f"""{root} +├── file0 +├── file1 +├── dir0 +├── dir1 +| └── dir1_file0 +└── dir2 + ├── dir2_file0 + ├── dir2_file1 + ├── dir2_dir0 + ├── dir2_dir1 + | └── dir2_dir1_file0 + └── dir2_dir2 + ├── dir2_dir2_file0 + └── dir2_dir2_file1 +""" + eq_(expected, actual) + diff --git a/datalad_next/tree.py b/datalad_next/tree.py index d1018c62..f5105250 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -12,12 +12,13 @@ import json import logging +import os from datalad.interface.base import ( Interface, build_doc, ) -from datalad.support.exceptions import CapturedException +from datalad.support.exceptions import CapturedException, NoDatasetFound from datalad.support.param import Parameter from datalad.distribution.dataset import ( datasetmethod, @@ -34,7 +35,7 @@ from datalad.support.constraints import ( EnsureChoice, EnsureNone, - EnsureStr, EnsureInt, + EnsureStr, EnsureInt, EnsureBool, EnsureRange, Constraints, ) lgr = logging.getLogger('datalad.local.tree') @@ -42,61 +43,310 @@ @build_doc class Tree(Interface): - """Visualize dataset hierarchy trees + """Visualize directory and dataset hierarchies - This command mimics the UNIX/MSDOS 'tree' command to display directory - trees, highlighting DataLad datasets in the hierarchy. + This command mimics the UNIX/MSDOS 'tree' command to display a + directory tree, highlighting DataLad datasets in the hierarchy. """ result_renderer = 'tailored' _params_ = dict( - dataset=Parameter( - args=("-d", "--dataset"), - doc="""specify a dataset for which to generate the - directory tree. If no dataset is given, will generate the - tree starting from the current directory.""", - constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path",), nargs='?', doc="""path to directory from which to generate the tree. - If empty, will generate the tree starting from the current - directory.""", + Defaults to the current directory.""", constraints=EnsureStr() | EnsureNone()), - level=Parameter( - args=("-L", "--level",), - doc="""maximum depth for dataset/directory tree""", - constraints=EnsureInt() | EnsureNone()), - # TODO: - # --include-files (only lists directories by default) - # --full-paths (equivalent of 'tree -f') + depth=Parameter( + args=("-L", "--depth",), + doc="""maximum tree depth to display. 
Can refer to either + directory depth or dataset hierarchy depth, depending on + the value of [CMD: --depth-mode CMD][PY: `depth_mode` PY].""", + constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone()), + depth_mode=Parameter( + args=("--depth-mode",), + doc="""interpret [CMD: --depth CMD][PY: `depth` PY] parameter to mean either + directory depth or subdataset hierarchy depth.""", + constraints=EnsureChoice("directory", "dataset")), + datasets_only=Parameter( + args=("--datasets-only",), + doc="""whether to only list directories that are datasets""", + action='store_true'), + include_files=Parameter( + args=("--include-files",), + doc="""whether to include files in output display""", + action='store_true'), + include_hidden=Parameter( + args=("-a", "--include-hidden",), + doc="""whether to include hidden files/directories in output display""", + action='store_true'), + full_paths=Parameter( + args=("--full-paths",), + doc="""whether to display full paths""", + action='store_true'), ) _examples_ = [ dict( - text="Display first-level subdirectories of the current directory, " - "with markers highlighting datasets", - code_py="tree('.')", - code_cmd="datalad tree -L 1"), - dict(text="Display the full dataset hierarchy from the current dataset, " - "only showing directories that are datasets", - code_py="tree(dataset='.', full_paths=True)", - code_cmd="datalad tree -d . --full-paths"), + text="Display up to 3 levels of subdirectories and their " + "contents starting from the current directory", + code_py="tree(depth=3, include_files=True)", + code_cmd="datalad tree -L 3 --include-files"), + dict(text="List all first- and second-level subdatasets " + "of datasets located anywhere under /tmp (regardless " + "of directory depth), displaying their full paths", + code_py="tree('/tmp', depth=2, depth_mode='dataset', datasets_only=True, full_paths=True)", + code_cmd="datalad tree /tmp -L 2 --depth-mode dataset --datasets-only --full-paths"), ] @staticmethod @datasetmethod(name='tree') @eval_results - def __call__(path='.', dataset=None, *, level=None): + def __call__(path='.', *, depth=None, depth_mode='directory', + datasets_only=False, include_files=False, include_hidden=False, full_paths=False): - ds = require_dataset( - dataset, - check_installed=True, - purpose='display dataset hierarchy tree') + # print tree output + walk = Walk(path, depth, datasets_only=datasets_only, include_files=include_files) + walk.build_tree() + print(walk.get_tree()) + print(walk.stats()) + # return a generic OK status yield get_status_dict( action='tree', status='ok', - ds=ds, - ) \ No newline at end of file + path=path, + ) + + +class Walk(object): + + def __init__(self, root: str, max_depth: int, + datasets_only=False, include_files=False, + include_hidden=False, full_paths=False): + if not os.path.isdir(root): + raise ValueError(f"directory '{root}' not found") + self.root = root + self.max_depth = max_depth + self.datasets_only = datasets_only + self.include_files = include_files + self.include_hidden = include_hidden + self.full_paths = full_paths + self._output = "" + self._last_children = [] + self._stats = {'dir_count': 0, 'file_count': 0, 'dataset_count': 0} + + def get_tree(self): + return self._output + + def _current_depth(self, path: str): + """Directory depth of current path relative to root of the walk""" + # directory depth can be safely inferred from the number of + # path separators in path, since pathsep characters are illegal + # in file or directory names. 
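+        # e.g. with root '/tmp/walk' (two separators), the path
+        # '/tmp/walk/a/b' (four separators) is at depth 2,
+        # and the root itself is at depth 0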
+        return path.count(os.path.sep) - self.root.rstrip(os.path.sep).count(os.path.sep)
+
+    def _is_last_child(self, path):
+        """Whether an item is the last child within its subtree"""
+        return path in self._last_children
+
+    def _is_max_depth_reached(self, path):
+        """
+        If max depth is reached, it means we will not traverse
+        any further directories in the next iteration.
+        However, we will still list any directories or files
+        below the current level.
+        Therefore, we 'reach' when we get 1 level before max_depth.
+        """
+        return self._current_depth(path) == self.max_depth - 1
+
+    def stats(self):
+        """Equivalent of tree command's report line.
+        TODO: add dataset count"""
+        return f"{self._stats}\n"
+
+    def generate_tree_items(self):
+        """Generator of directories/files, traversed in depth-first order."""
+        for path, dirs, files in os.walk(self.root):
+
+            # exclude hidden files/directories unless specified by arg.
+            # we modify os.walk's output in-place.
+            if not self.include_hidden:
+                dirs[:] = [d for d in dirs if not d.startswith(".")]
+                files[:] = [f for f in files if not f.startswith(".")]
+
+            # sort directories and files alphabetically in-place
+            dirs.sort()
+            files.sort()
+
+            # check if item is the last child within its subtree
+            # (needed for applying special formatting)
+            if dirs or files:  # if there is a next level
+                # files are listed first, directories come last.
+                # so we take the last subdirectory if it exists,
+                # otherwise the last file.
+                self._last_children.append(
+                    os.path.join(path, dirs[-1] if dirs else files[-1])
+                )
+
+            current_depth = self._current_depth(path)
+            item = DirectoryItem(path, current_depth, self._is_last_child(path))
+
+            if not self.datasets_only or self.datasets_only and item.is_dataset():
+                yield item
+                if current_depth > 0:
+                    self._stats['dir_count'] += 1  # do not count root directory
+
+            if self.include_files:
+                for file in files:
+                    file_path = os.path.join(path, file)
+                    yield FileItem(file_path, current_depth + 1,
+                                   self._is_last_child(file_path))
+                    self._stats['file_count'] += 1
+
+            if self._is_max_depth_reached(path):
+                # generate any remaining directory items, which
+                # will not be traversed
+                for child_dir in dirs:
+                    dir_path = os.path.join(path, child_dir)
+                    yield DirectoryItem(dir_path, current_depth + 1,
+                                        self._is_last_child(dir_path))
+                    self._stats['dir_count'] += 1
+
+                # empty in-place the list of next directories
+                # to traverse. this effectively stops os.walk's walking.
+                dirs[:] = []
+
+    def build_tree(self):
+        """
+        Structure of tree output line (express in BNF?):
+        [padding]?[prefix]?[path]
+        Example:
+        `| | |– mydir`
+        """
+        # keep track of levels where subtree is exhausted,
+        # i.e. we have reached the last child of the subtree.
+        # this is needed to build the padding string for each item,
+        # which takes into account whether any parent
+        # is the last item of its own subtree.
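+        # e.g. for an item at depth 3, an ancestor at depth 1 that still
+        # has pending children contributes a '|' column to the padding,
+        # whereas an already exhausted ancestor contributes only spaces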
+ levels_with_exhausted_subtree = set([]) + + for item in self.generate_tree_items(): + lgr.debug(item) + + if item.is_last_child: # last child of its subtree + levels_with_exhausted_subtree.add(item.depth) + else: + # 'discard' does not raise exception + # if value does not exist in set + levels_with_exhausted_subtree.discard(item.depth) + + path = item.path # display of item path + padding = "" # vertical lines for continuing the parent subtrees + prefix = "" # single indentation symbol for the given item + + if item.depth > 0: + # for non-root items, display the basename + path = os.path.basename(item.path) + + # build padding string + padding_symbols_for_levels = [ + "| " + if level not in levels_with_exhausted_subtree + else " " + for level in range(1, item.depth) + ] + padding = ''.join(padding_symbols_for_levels) + + # set prefix + if item.is_last_child: + prefix = "└── " + else: + prefix = "├── " + + self._output += (padding + prefix + path + "\n") + + +class DirectoryWalk(Walk): + """ + Traverse a hierarchy of directories. + In this context, 'depth' means directory depth. + """ + pass + + +class DatasetWalk(Walk): + """ + Traverse a hierarchy of DataLad datasets and subdatasets. + In this context, 'depth' means level of subdataset nesting + (only for datasets installed as subdatasets). + Considers only proper DataLad datasets (with a dataset ID), + not regular git/git-annex repos. + """ + + @staticmethod + def _current_subdataset_depth(path): + """Subdataset level relative to the root path. + For example, if building the tree starting from a direct + subdataset of a top-level parent dataset, will return + depth 0 for the subdataset root, depth 1 for the + sub-subdataset, etc.""" + + # TODO: make sure we consider datasets only strictly datalad + # datasets, not any git repo (may be confusing for users) + return 0 + + def _is_max_depth_reached(self, path): + return self._current_subdataset_depth(path) > self.max_depth + + +class _TreeItem(object): + """ + Base class for a directory or file represented in a single + line of the 'tree' output. + """ + + def __init__(self, path: str, depth: int, is_last_child): + self.path = path + self.depth = depth # directory depth + self.is_last_child = is_last_child # if it is last item of its subtree + + def __str__(self): + return self.path + + def format(self): + raise NotImplementedError("implemented by subclasses") + + +class DirectoryItem(_TreeItem): + def is_dataset(self): + try: + ds = require_dataset(self.path, check_installed=True) + return ds.id is not None + except (NoDatasetFound, AttributeError): + return False + + +class FileItem(_TreeItem): + pass + + +class DatasetItem(_TreeItem): + def __init__(self, *args, abs_subds_depth=None, **kwargs): + # absolute subdataset depth: + # if None, it is not a dataset (or it is a . + # if 0, it is a top-level dataset. + self.abs_subds_depth = abs_subds_depth + super().__init__(*args, **kwargs) + + def _absolute_subdataset_depth(self, path): + """Subdataset level in the context of the full dataset + hierarchy. + For example, if building the tree starting from a direct + subdataset of a top-level parent dataset, will return depth 1 + for the subdataset, depth 2 for the sub-subdataset, etc.""" + + # TODO: check how recursion levels are handled e.g. 
in datalad status + pass + From 089226b89b15faa8adc692ba14b4a9c9bde41cb1 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 3 Jul 2022 18:03:53 +0200 Subject: [PATCH 005/131] clean up docstrings / comments --- datalad_next/tree.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index f5105250..d5f5d739 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -145,6 +145,7 @@ def _current_depth(self, path: str): # directory depth can be safely inferred from the number of # path separators in path, since pathsep characters are illegal # in file or directory names. + # TODO: sanitize / normalize root path in Walk constructor return path.count(os.path.sep) - self.root.rstrip(os.path.sep).count(os.path.sep) def _is_last_child(self, path): @@ -156,18 +157,20 @@ def _is_max_depth_reached(self, path): If max depth is reached, it means we will not traverse any further directories in the next iteration. However, we will still list any directories or files - below the current level. - Therefore, we 'reach' when we get 1 level before max_depth. + right below the current level. + Therefore, we 'reach' when we get 1 level *before* max_depth. """ return self._current_depth(path) == self.max_depth - 1 def stats(self): - """Equivalent of tree command's 'report line'. - TODO: add dataset count""" + """ + Equivalent of tree command's report line. + TODO: add dataset count + """ return f"{self._stats}\n" def generate_tree_items(self): - """Generator of directories/files, traversed in depth-first order.""" + """Generator of directories/files using depth-first traversal""" for path, dirs, files in os.walk(self.root): # exclude hidden files/directories unless specified by arg. From 6f191ec40fcb720f9987a9bb151bbe31a3215845 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 3 Jul 2022 18:06:17 +0200 Subject: [PATCH 006/131] set default depth to 1 (prevents annoying wall-of-text if forgot to specify depth parameter) --- datalad_next/tree.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index d5f5d739..52e96e46 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -103,7 +103,7 @@ class Tree(Interface): @staticmethod @datasetmethod(name='tree') @eval_results - def __call__(path='.', *, depth=None, depth_mode='directory', + def __call__(path='.', *, depth=1, depth_mode='directory', datasets_only=False, include_files=False, include_hidden=False, full_paths=False): # print tree output @@ -125,6 +125,7 @@ class Walk(object): def __init__(self, root: str, max_depth: int, datasets_only=False, include_files=False, include_hidden=False, full_paths=False): + # TODO: validate parameters if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") self.root = root From c17868bb6e8aba7d50a1f456bf5b64335d98ad2b Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Mon, 4 Jul 2022 21:42:45 +0200 Subject: [PATCH 007/131] remove parameter --full-paths, does not add much value --- datalad_next/tree.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 52e96e46..e62c1582 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -81,10 +81,6 @@ class Tree(Interface): args=("-a", "--include-hidden",), doc="""whether to include hidden files/directories in output display""", action='store_true'), - full_paths=Parameter( - args=("--full-paths",), - 
doc="""whether to display full paths""", - action='store_true'), ) _examples_ = [ @@ -94,17 +90,17 @@ class Tree(Interface): code_py="tree(depth=3, include_files=True)", code_cmd="datalad tree -L 3 --include-files"), dict(text="List all first- and second-level subdatasets " - "of datasets located anywhere under /tmp (regardless " - "of directory depth), displaying their full paths", - code_py="tree('/tmp', depth=2, depth_mode='dataset', datasets_only=True, full_paths=True)", - code_cmd="datalad tree /tmp -L 2 --depth-mode dataset --datasets-only --full-paths"), + "of datasets located anywhere under /tmp, " + "regardless of directory depth", + code_py="tree('/tmp', depth=2, depth_mode='dataset', datasets_only=Truec)", + code_cmd="datalad tree /tmp -L 2 --depth-mode dataset --datasets-only"), ] @staticmethod @datasetmethod(name='tree') @eval_results def __call__(path='.', *, depth=1, depth_mode='directory', - datasets_only=False, include_files=False, include_hidden=False, full_paths=False): + datasets_only=False, include_files=False, include_hidden=False): # print tree output walk = Walk(path, depth, datasets_only=datasets_only, include_files=include_files) @@ -124,7 +120,7 @@ class Walk(object): def __init__(self, root: str, max_depth: int, datasets_only=False, include_files=False, - include_hidden=False, full_paths=False): + include_hidden=False): # TODO: validate parameters if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") @@ -133,7 +129,6 @@ def __init__(self, root: str, max_depth: int, self.datasets_only = datasets_only self.include_files = include_files self.include_hidden = include_hidden - self.full_paths = full_paths self._output = "" self._last_children = [] self._stats = {'dir_count': 0, 'file_count': 0, 'dataset_count': 0} From cafd3d3280c2d74a0dc75a1af4539d49170ff3e3 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 5 Jul 2022 01:33:07 +0200 Subject: [PATCH 008/131] set up parametrized tests to cover combinations of datalad tree options --- datalad_next/conftest.py | 4 + datalad_next/tests/test_tree.py | 191 +++++++++++++++++++++++++++----- 2 files changed, 166 insertions(+), 29 deletions(-) diff --git a/datalad_next/conftest.py b/datalad_next/conftest.py index 7c727c32..80ea1d3f 100644 --- a/datalad_next/conftest.py +++ b/datalad_next/conftest.py @@ -14,3 +14,7 @@ def setup_package(): yield _teardown_package() + +# pytest hooks for pretty-formatting of parameter IDs +def pytest_make_parametrize_id(config, val, argname): + return f"{argname}={val}" diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 2cf4e4da..885f8ab0 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,29 +1,29 @@ import os +import pytest from datalad.tests.utils_pytest import ( assert_in, assert_not_in, assert_raises, eq_, neq_, - patch_config, - with_tree, + with_tree, assert_str_equal, ) +from datalad.utils import rmtemp + from ..tree import Tree, Walk """ Tests for datalad tree. -TODO: create fixture for precomputing tree output for a given set -of parameters, so it can be reused in multiple tests. """ -# directory layout to be tested that will be created as temp dir. -# directory index is equals to the count of its subdirectories -# and the (count-1) of files contained underneath it. 
-# TODO: generate programmatically instead of hardcoding -# (though it's easier to visualize if hardcoded) +# directory layout to be tested that will be created as temp dir _temp_dir_tree = { "root": { + ".dir3": { + "dir3_file0": 'tempfile', + ".dir3_file1": 'tempfile', + }, "dir0": {}, # empty dir "dir1": { "dir1_file0": 'tempfile', @@ -38,22 +38,67 @@ "dir2_dir2_file1": 'tempfile', }, "dir2_file0": 'tempfile', - "dir2_file1":'tempfile' + "dir2_file1": 'tempfile', }, + ".file2": 'tempfile', "file0": 'tempfile', "file1": 'tempfile', } } -@with_tree(_temp_dir_tree) -def test_build_tree_dirs_only(path=None): - root = os.path.join(path, 'root') - walk = Walk(root, max_depth=3, include_files=False) - walk.build_tree() - actual = walk.get_tree() +@pytest.fixture(scope="module") +def path(): + """ + Create a temporary directory tree once for the whole module, + to be used as test data for all tests. + This is a shim for the 'with_tree' decorator so it can be used + as module-scoped pytest fixture. + """ + # function to be decorated by 'with_tree' + # just return the argument (will be the created temp path) + identity_func = lambda d: d + + # give an informative name to the lambda function, since + # it will be included in the name of the temp dir + identity_func.__name__ = "test_tree" + + # call the 'with_tree' decorator to return the path + # of the created temp dir root, without deleting it + temp_dir_root = with_tree(_temp_dir_tree, delete=False)(identity_func)() + print(f"created temp dir at {temp_dir_root}") + yield temp_dir_root + rmtemp(temp_dir_root) # this duplicates 'with_tree' code + print(f"deleted temp dir at {temp_dir_root}") + + +def format_param_ids(val): + """Helper to format pytest parameter IDs. + If the parameter is a string containing newlines, we assume it + is the parameter 'expected' (expected output of tree), and just + give it a fixed ID.""" + if isinstance(val, str) and "\n" in val: + return "expected" - expected = f"""{root} + +param_matrix = [ + # (2 levels per param) ** (3 params) = 8 combinations + 8 expected results + # column names: depth, include_files, include_hidden, expected + [ + 1, + False, + False, + """ +├── dir0 +├── dir1 +└── dir2 +""" + ], + [ + 3, + False, + False, + """ ├── dir0 ├── dir1 └── dir2 @@ -61,19 +106,65 @@ def test_build_tree_dirs_only(path=None): ├── dir2_dir1 └── dir2_dir2 """ - eq_(expected, actual) - - -@with_tree(_temp_dir_tree) -def test_build_tree_including_files(path=None): - root = os.path.join(path, 'root') - walk = Walk(root, max_depth=3, include_files=True) - walk.build_tree() - actual = walk.get_tree() - - expected = f"""{root} + ], + [ + 1, + True, + False, + """ +├── file0 +├── file1 +├── dir0 +├── dir1 +└── dir2 +""" + ], + [ + 3, + True, + False, + """ +├── file0 +├── file1 +├── dir0 +├── dir1 +| └── dir1_file0 +└── dir2 + ├── dir2_file0 + ├── dir2_file1 + ├── dir2_dir0 + ├── dir2_dir1 + | └── dir2_dir1_file0 + └── dir2_dir2 + ├── dir2_dir2_file0 + └── dir2_dir2_file1 +""" + ], + [ + 1, + True, + True, + """ +├── .file2 +├── file0 +├── file1 +├── .dir3 +├── dir0 +├── dir1 +└── dir2 +""" + ], + [ + 3, + True, + True, + """ +├── .file2 ├── file0 ├── file1 +├── .dir3 +| ├── .dir3_file1 +| └── dir3_file0 ├── dir0 ├── dir1 | └── dir1_file0 @@ -87,5 +178,47 @@ def test_build_tree_including_files(path=None): ├── dir2_dir2_file0 └── dir2_dir2_file1 """ - eq_(expected, actual) + ], + [ + 1, + False, + True, + """ +├── .dir3 +├── dir0 +├── dir1 +└── dir2 +""" + ], + [ + 3, + False, + True, + """ +├── .dir3 +├── dir0 +├── dir1 +└── 
dir2 + ├── dir2_dir0 + ├── dir2_dir1 + └── dir2_dir2 +""" + ] +] + +@pytest.mark.parametrize( + ["depth", "include_files", "include_hidden", "expected"], + param_matrix, ids=format_param_ids +) +def test_print_tree_with_params( + path, depth, include_files, include_hidden, expected +): + root = os.path.join(path, 'root') + walk = Walk( + root, max_depth=depth, + include_files=include_files, include_hidden=include_hidden) + walk.build_tree() + actual_res = walk.get_tree() + expected_res = root + expected + assert_str_equal(expected_res, actual_res) From 7e7ab1d0c206a1d52d8561c9a225effc0849679c Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 6 Jul 2022 21:52:48 +0200 Subject: [PATCH 009/131] major refactoring, add tests for tree stats --- datalad_next/tests/test_tree.py | 248 ++++++++++++--------- datalad_next/tree.py | 374 ++++++++++++++++++++------------ 2 files changed, 387 insertions(+), 235 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 885f8ab0..97fbdca5 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,4 +1,5 @@ import os +from random import random import pytest from datalad.tests.utils_pytest import ( @@ -7,53 +8,22 @@ assert_raises, eq_, neq_, - with_tree, assert_str_equal, + with_tree, assert_str_equal ) from datalad.utils import rmtemp -from ..tree import Tree, Walk +from ..tree import Tree """ Tests for datalad tree. """ -# directory layout to be tested that will be created as temp dir -_temp_dir_tree = { - "root": { - ".dir3": { - "dir3_file0": 'tempfile', - ".dir3_file1": 'tempfile', - }, - "dir0": {}, # empty dir - "dir1": { - "dir1_file0": 'tempfile', - }, - "dir2": { - "dir2_dir0": {}, - "dir2_dir1": { - "dir2_dir1_file0": 'tempfile', - }, - "dir2_dir2": { - "dir2_dir2_file0": 'tempfile', - "dir2_dir2_file1": 'tempfile', - }, - "dir2_file0": 'tempfile', - "dir2_file1": 'tempfile', - }, - ".file2": 'tempfile', - "file0": 'tempfile', - "file1": 'tempfile', - } -} - -@pytest.fixture(scope="module") -def path(): +def create_temp_dir_tree(tree_dict): """ - Create a temporary directory tree once for the whole module, - to be used as test data for all tests. + Create a temporary directory tree. This is a shim for the 'with_tree' decorator so it can be used - as module-scoped pytest fixture. + in a module-scoped pytest fixture. 
""" # function to be decorated by 'with_tree' # just return the argument (will be the created temp path) @@ -65,40 +35,79 @@ def path(): # call the 'with_tree' decorator to return the path # of the created temp dir root, without deleting it - temp_dir_root = with_tree(_temp_dir_tree, delete=False)(identity_func)() - print(f"created temp dir at {temp_dir_root}") + temp_dir_root = with_tree(tree_dict, delete=False)(identity_func)() + return temp_dir_root + + +@pytest.fixture(scope="module") +def path(): + """ + Fixture for temporary directory tree including nested + directories, without datasets + """ + dir_tree = { + "root": { + ".dir3": { + "dir3_file0": 'tempfile', + ".dir3_file1": 'tempfile', + }, + "dir0": {}, # empty dir + "dir1": { + "dir1_file0": 'tempfile', + }, + "dir2": { + "dir2_dir0": {}, + "dir2_dir1": { + "dir2_dir1_file0": 'tempfile', + }, + "dir2_dir2": { + "dir2_dir2_file0": 'tempfile', + "dir2_dir2_file1": 'tempfile', + }, + "dir2_file0": 'tempfile', + "dir2_file1": 'tempfile', + }, + ".file2": 'tempfile', + "file0": 'tempfile', + "file1": 'tempfile', + } + } + + temp_dir_root = create_temp_dir_tree(dir_tree) yield temp_dir_root - rmtemp(temp_dir_root) # this duplicates 'with_tree' code - print(f"deleted temp dir at {temp_dir_root}") + rmtemp(temp_dir_root) + assert not os.path.exists(temp_dir_root) def format_param_ids(val): """Helper to format pytest parameter IDs. - If the parameter is a string containing newlines, we assume it - is the parameter 'expected' (expected output of tree), and just + If the parameter is a multiline string, we assume it is the + parameter 'expected' (expected output of tree), and just give it a fixed ID.""" if isinstance(val, str) and "\n" in val: return "expected" -param_matrix = [ - # (2 levels per param) ** (3 params) = 8 combinations + 8 expected results - # column names: depth, include_files, include_hidden, expected - [ - 1, - False, - False, - """ +# Combinations of parameters to be tested and their expected results. 
+# (2 levels per param) ** (3 params) = 8 combinations + 8 expected results +param_combinations = [ + { + "depth": 1, + "include_files": False, + "include_hidden": False, + "expected_stats_str": "3 directories, 0 datasets, 0 files", + "expected_str": """ ├── dir0 ├── dir1 └── dir2 """ - ], - [ - 3, - False, - False, - """ + }, + { + "depth": 3, + "include_files": False, + "include_hidden": False, + "expected_stats_str": "6 directories, 0 datasets, 0 files", + "expected_str": """ ├── dir0 ├── dir1 └── dir2 @@ -106,24 +115,26 @@ def format_param_ids(val): ├── dir2_dir1 └── dir2_dir2 """ - ], - [ - 1, - True, - False, - """ + }, + { + "depth": 1, + "include_files": True, + "include_hidden": False, + "expected_stats_str": "3 directories, 0 datasets, 2 files", + "expected_str": """ ├── file0 ├── file1 ├── dir0 ├── dir1 └── dir2 """ - ], - [ - 3, - True, - False, - """ + }, + { + "depth": 3, + "include_files": True, + "include_hidden": False, + "expected_stats_str": "6 directories, 0 datasets, 8 files", + "expected_str": """ ├── file0 ├── file1 ├── dir0 @@ -139,12 +150,13 @@ def format_param_ids(val): ├── dir2_dir2_file0 └── dir2_dir2_file1 """ - ], - [ - 1, - True, - True, - """ + }, + { + "depth": 1, + "include_files": True, + "include_hidden": True, + "expected_stats_str": "4 directories, 0 datasets, 3 files", + "expected_str": """ ├── .file2 ├── file0 ├── file1 @@ -153,12 +165,13 @@ def format_param_ids(val): ├── dir1 └── dir2 """ - ], - [ - 3, - True, - True, - """ + }, + { + "depth": 3, + "include_files": True, + "include_hidden": True, + "expected_stats_str": "7 directories, 0 datasets, 11 files", + "expected_str": """ ├── .file2 ├── file0 ├── file1 @@ -178,23 +191,25 @@ def format_param_ids(val): ├── dir2_dir2_file0 └── dir2_dir2_file1 """ - ], - [ - 1, - False, - True, - """ + }, + { + "depth": 1, + "include_files": False, + "include_hidden": True, + "expected_stats_str": "4 directories, 0 datasets, 0 files", + "expected_str": """ ├── .dir3 ├── dir0 ├── dir1 └── dir2 """ - ], - [ - 3, - False, - True, - """ + }, + { + "depth": 3, + "include_files": False, + "include_hidden": True, + "expected_stats_str": "7 directories, 0 datasets, 0 files", + "expected_str": """ ├── .dir3 ├── dir0 ├── dir1 @@ -203,22 +218,55 @@ def format_param_ids(val): ├── dir2_dir1 └── dir2_dir2 """ - ] + }, ] +def build_param_matrix(param_names): + matrix = [] + for combination in param_combinations: + matrix.append( + # order of combinations does not matter + [val for key, val in combination.items() if key in param_names] + ) + return matrix + + @pytest.mark.parametrize( - ["depth", "include_files", "include_hidden", "expected"], - param_matrix, ids=format_param_ids + ["depth", "include_files", "include_hidden", "expected_str"], + build_param_matrix(["depth", "include_files", "include_hidden", "expected_str"]), ids=format_param_ids ) def test_print_tree_with_params( - path, depth, include_files, include_hidden, expected + path, depth, include_files, include_hidden, expected_str ): root = os.path.join(path, 'root') - walk = Walk( + tree = Tree( root, max_depth=depth, include_files=include_files, include_hidden=include_hidden) - walk.build_tree() - actual_res = walk.get_tree() - expected_res = root + expected + actual_res = str(tree) + expected_res = root + expected_str + assert_str_equal(expected_res, actual_res) + + +def test_print_tree_for_nonexistent_directory(): + """Obtain nonexistent directory by creating a temp dir + and deleting it (may be safest method)""" + nonexistent_dir = with_tree({"to_be_deleted": 
[]})(lambda f: f)() + with assert_raises(ValueError): + Tree(nonexistent_dir, max_depth=1) + + +@pytest.mark.parametrize( + ["depth", "include_files", "include_hidden", "expected_stats_str"], + build_param_matrix(["depth", "include_files", "include_hidden", "expected_stats_str"]) +) +def test_tree_stats( + path, depth, include_files, include_hidden, expected_stats_str +): + root = os.path.join(path, 'root') + tree = Tree( + root, max_depth=depth, + include_files=include_files, include_hidden=include_hidden).build() + actual_res = tree.stats() + expected_res = expected_stats_str + "\n" assert_str_equal(expected_res, actual_res) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index e62c1582..8cac9a98 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -6,13 +6,46 @@ # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## -"""'tree'-like command for visualization of dataset hierarchies""" +""" +'tree'-like command for visualization of dataset hierarchies. + +This command covers 2 main use cases: + +(1) Glorified `tree` command: + --- + As a datalad user, I want to list the contents of a directory tree and + see which directories are datalad datasets, so that I can locate my + datasets in the context of the whole directory layout. + --- + This is basically what is implemented by the `tree-datalad` utility -- + just `tree` with visual markers for datasets. + In addition to it, `datalad-tree` provides the following: + 1. The subdataset hierarchy level information + (included in the dataset marker, e.g. [DS~0]). + This is the absolute level, meaning it may take into account + superdatasets that are not included in the display. + 2. The option to list only directories that are datasets + 3. The count of displayed datasets in the "report line" + (where `tree` only reports count of directories and files) + +(2) Descriptor of nested subdataset hierarchies: + --- + As a datalad user, I want to visualize the structure of multiple datasets + and their hierarchies at once based on the subdataset nesting level, + regardless of their actual depth in the directory tree. This helps me + understand and communicate the layout of my datasets. + --- + This is the more datalad-specific case. Here we redefine 'depth' as the + level in the subdataset hierarchy instead of the filesystem hierarchy. + +""" __docformat__ = 'restructuredtext' import json import logging import os +from functools import wraps from datalad.interface.base import ( Interface, @@ -42,7 +75,7 @@ @build_doc -class Tree(Interface): +class TreeCommand(Interface): """Visualize directory and dataset hierarchies This command mimics the UNIX/MSDOS 'tree' command to display a @@ -59,16 +92,13 @@ class Tree(Interface): Defaults to the current directory.""", constraints=EnsureStr() | EnsureNone()), depth=Parameter( - args=("-L", "--depth",), - doc="""maximum tree depth to display. 
Can refer to either - directory depth or dataset hierarchy depth, depending on - the value of [CMD: --depth-mode CMD][PY: `depth_mode` PY].""", - constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone()), - depth_mode=Parameter( - args=("--depth-mode",), - doc="""interpret [CMD: --depth CMD][PY: `depth` PY] parameter to mean either - directory depth or subdataset hierarchy depth.""", - constraints=EnsureChoice("directory", "dataset")), + args=("-L", "--directory-depth",), + doc="""maximum depth of directory tree to display""", + constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), + dataset_depth=Parameter( + args=("-R", "--dataset-depth",), + doc="""maximum depth of nested subdatasets to display""", + constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), datasets_only=Parameter( args=("--datasets-only",), doc="""whether to only list directories that are datasets""", @@ -99,14 +129,13 @@ class Tree(Interface): @staticmethod @datasetmethod(name='tree') @eval_results - def __call__(path='.', *, depth=1, depth_mode='directory', + def __call__(path='.', *, depth=1, dataset_depth=None, datasets_only=False, include_files=False, include_hidden=False): # print tree output - walk = Walk(path, depth, datasets_only=datasets_only, include_files=include_files) - walk.build_tree() - print(walk.get_tree()) - print(walk.stats()) + tree = Tree(path, depth, datasets_only=datasets_only, include_files=include_files) + print(tree) + print(tree.stats()) # return a generic OK status yield get_status_dict( @@ -116,32 +145,70 @@ def __call__(path='.', *, depth=1, depth_mode='directory', ) -class Walk(object): +def increment_node_count(node_generator_func): + """ + Decorator for incrementing the node count whenever a _TreeNode is yielded. + """ + @wraps(node_generator_func) + def _wrapper(*args, **kwargs): + self = args[0] # 'self' is a Tree instance + for node in node_generator_func(*args, **kwargs): + node_type = node.__class__.__name__ + if node_type not in self._stats: + raise ValueError(f"No stats collected for unknown node type '{node_type}'") + if node.depth > 0: # we do not count the root directory + self._stats[node_type] += 1 + + yield node # yield what the generator yielded + + return _wrapper + + +def is_path_child_of_parent(child, parent): + parent_abs = os.path.abspath(parent) + child_abs = os.path.abspath(child) + return os.path.commonpath([parent_abs]) == \ + os.path.commonpath([parent_abs, child_abs]) - def __init__(self, root: str, max_depth: int, + +class Tree(object): + """ + Does not store _TreeNode objects, only the string representation + of the whole tree and the statistics (counts of different node types). 
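+
+    A minimal usage sketch (hypothetical path, for illustration only;
+    assumes the API introduced in this patch):
+
+        tree = Tree('/tmp/mydir', max_depth=2, include_files=True)
+        print(tree)          # __str__ builds and returns the tree string
+        print(tree.stats())  # report line with the node counts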
+ """ + + def __init__(self, root: str, max_depth: int, dataset_max_depth=None, datasets_only=False, include_files=False, include_hidden=False): # TODO: validate parameters if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") + # TODO: sanitize / normalize root path self.root = root self.max_depth = max_depth + self.dataset_max_depth = dataset_max_depth self.datasets_only = datasets_only self.include_files = include_files self.include_hidden = include_hidden - self._output = "" + self._current_dataset_depth = -1 + self._string_repr = "" # holds the serialized representation self._last_children = [] - self._stats = {'dir_count': 0, 'file_count': 0, 'dataset_count': 0} + # TODO: stats should automatically register all concrete _TreeNode classes + self._stats = {"DirectoryNode": 0, "DatasetNode": 0, "FileNode": 0} - def get_tree(self): - return self._output + def __str__(self): + """Return serialized Tree representation""" + if self._string_repr != "": + # tree has already been serialized + return self._string_repr + return self._build_string() def _current_depth(self, path: str): """Directory depth of current path relative to root of the walk""" # directory depth can be safely inferred from the number of # path separators in path, since pathsep characters are illegal # in file or directory names. - # TODO: sanitize / normalize root path in Walk constructor + # TODO: sanitize / normalize root path in Tree constructor return path.count(os.path.sep) - self.root.rstrip(os.path.sep).count(os.path.sep) def _is_last_child(self, path): @@ -154,33 +221,53 @@ def _is_max_depth_reached(self, path): any further directories in the next iteration. However, we will still list any directories or files right below the current level. - Therefore, we 'reach' when we get 1 level *before* max_depth. + Therefore, we 'reach' when we get to 1 level *before* max_depth. """ return self._current_depth(path) == self.max_depth - 1 + def _is_max_dataset_depth_reached(self, path): + pass + def stats(self): """ - Equivalent of tree command's report line. - TODO: add dataset count + Equivalent of tree command's 'report line'. + """ + return f"{self._stats['DirectoryNode']} directories, " \ + f"{self._stats['DatasetNode']} datasets, " \ + f"{self._stats['FileNode']} files" \ + "\n" + + def build(self): + """Public API for constructing tree. + Returns the instance.""" + self._build_string() + return self + + @increment_node_count + def _generate_nodes(self): + """ + Yields _TreeNode objects, each representing a directory, dataset + or file. Nodes are traversed in depth-first order. """ - return f"{self._stats}\n" - def generate_tree_items(self): - """Generator of directories/files using depth-first traversal""" + # os.walk() does depth-first traversal for path, dirs, files in os.walk(self.root): - # exclude hidden files/directories unless specified by arg. - # we modify os.walk's output in-place. + # modify os.walk()'s output in-place to prevent + # traversal into those directories if not self.include_hidden: dirs[:] = [d for d in dirs if not d.startswith(".")] files[:] = [f for f in files if not f.startswith(".")] - # sort directories and files alphabetically in-place + # sort directories and files alphabetically in-place. + # note that directories and files are sorted separately. + # files are all listed before the directories + # (just by convention, no particular reason). 
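+        # hypothetical example: with files ["b.txt", "a.txt"] and
+        # dirs ["z_dir", "a_dir"] at this level, the output lists
+        # a.txt and b.txt first, then recurses into a_dir and z_dir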
dirs.sort()
         files.sort()
 
-        # check if item is the last child within its subtree
-        # (needed for applying special formatting)
+        # check if node is the last child within its subtree
+        # (needed for displaying special end-of-subtree prefix)
         if dirs or files:  # if there is a next level
             # files are listed first, directories come last.
             # so we take the last subdirectory if it exists,
@@ -190,162 +277,179 @@ def generate_tree_items(self):
             )
 
             current_depth = self._current_depth(path)
-            item = DirectoryItem(path, current_depth, self._is_last_child(path))
-            if not self.datasets_only or self.datasets_only and item.is_dataset():
-                yield item
-                if current_depth > 0:
-                    self._stats['dir_count'] += 1  # do not count root directory
+            # handle directories/datasets
+            dir_or_ds = DirectoryOrDatasetNode(path, current_depth,
+                                               self._is_last_child(path))
+            if not self.datasets_only or \
+                    self.datasets_only and isinstance(dir_or_ds, DatasetNode):
+                yield dir_or_ds
 
+            # handle files
             if self.include_files:
                 for file in files:
                     file_path = os.path.join(path, file)
-                    yield FileItem(file_path, current_depth + 1,
+                    yield FileNode(file_path, current_depth + 1,
                                    self._is_last_child(file_path))
-                    self._stats['file_count'] += 1
 
             if self._is_max_depth_reached(path):
                 # generate any remaining directory items, which
                 # will not be traversed
                 for child_dir in dirs:
                     dir_path = os.path.join(path, child_dir)
-                    yield DirectoryItem(dir_path, current_depth + 1,
+                    yield DirectoryNode(dir_path, current_depth + 1,
                                         self._is_last_child(dir_path))
-                    self._stats['dir_count'] += 1
 
-                # empty in-place the list of next directories
-                # to traverse. this effectively stops os.walk's walking.
+                # empty in-place the list of next directories to
+                # traverse, which effectively stops os.walk's walking
                 dirs[:] = []
 
-    def build_tree(self):
+    def _build_string(self):
         """
-        Structure of tree output line (express in BNF?):
-        [padding]?[prefix]?[path]
+        Return tree as string, where each line represents a node
+        (directory or dataset or file).
+        Each line follows the structure:
+        `[<indentation>] [<branch_tip>] <path>`
         Example:
-        `| | |– mydir`
+        | | ├── path_dir_level3
         """
+
         # keep track of levels where subtree is exhausted,
         # i.e. we have reached the last child of the subtree.
-        # this is needed to build the padding string for each item,
+        # this is needed to build the indentation string for each item,
         # which takes into account whether any parent
-        # is the last item of its own subtree.
+        # is the last node of its own subtree.
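+        # e.g. for a node at depth 3: an exhausted level-1 ancestor
+        # subtree contributes blank padding, while a still-open
+        # level-2 subtree contributes a vertical continuation symbol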
levels_with_exhausted_subtree = set([]) - for item in self.generate_tree_items(): - lgr.debug(item) + for node in self._generate_nodes(): + lgr.debug(node) - if item.is_last_child: # last child of its subtree - levels_with_exhausted_subtree.add(item.depth) + if node.is_last_child: # last child of its subtree + levels_with_exhausted_subtree.add(node.depth) else: # 'discard' does not raise exception # if value does not exist in set - levels_with_exhausted_subtree.discard(item.depth) + levels_with_exhausted_subtree.discard(node.depth) - path = item.path # display of item path - padding = "" # vertical lines for continuing the parent subtrees - prefix = "" # single indentation symbol for the given item + path = node.path + indentation = "" # vertical continuation lines + branch_tip = "" # prefix symbol for the given node - if item.depth > 0: + if node.depth > 0: # for non-root items, display the basename - path = os.path.basename(item.path) - - # build padding string - padding_symbols_for_levels = [ - "| " - if level not in levels_with_exhausted_subtree - else " " - for level in range(1, item.depth) + path = os.path.basename(node.path) + + # build indentation string + indentation_symbols_for_levels = [ + " " + if level in levels_with_exhausted_subtree + else "| " + for level in range(1, node.depth) ] - padding = ''.join(padding_symbols_for_levels) + indentation = "".join(indentation_symbols_for_levels) - # set prefix - if item.is_last_child: - prefix = "└── " + # prepend prefix to path + if node.is_last_child: + branch_tip = "└── " else: - prefix = "├── " - - self._output += (padding + prefix + path + "\n") - - -class DirectoryWalk(Walk): - """ - Traverse a hierarchy of directories. - In this context, 'depth' means directory depth. - """ - pass + branch_tip = "├── " + self._string_repr += (indentation + branch_tip + path + "\n") -class DatasetWalk(Walk): - """ - Traverse a hierarchy of DataLad datasets and subdatasets. - In this context, 'depth' means level of subdataset nesting - (only for datasets installed as subdatasets). - Considers only proper DataLad datasets (with a dataset ID), - not regular git/git-annex repos. - """ - - @staticmethod - def _current_subdataset_depth(path): - """Subdataset level relative to the root path. - For example, if building the tree starting from a direct - subdataset of a top-level parent dataset, will return - depth 0 for the subdataset root, depth 1 for the - sub-subdataset, etc.""" - - # TODO: make sure we consider datasets only strictly datalad - # datasets, not any git repo (may be confusing for users) - return 0 + return self._string_repr - def _is_max_depth_reached(self, path): - return self._current_subdataset_depth(path) > self.max_depth - -class _TreeItem(object): +class _TreeNode(object): """ - Base class for a directory or file represented in a single - line of the 'tree' output. + Base class for a directory or file represented as a single + tree node and printed as single line of the 'tree' output. 
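+    The indentation and branch-tip prefix of each printed line are
+    assembled in ``Tree._build_string()``, not by the node itself.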
""" def __init__(self, path: str, depth: int, is_last_child): self.path = path - self.depth = depth # directory depth + self.depth = depth # depth in the directory tree self.is_last_child = is_last_child # if it is last item of its subtree def __str__(self): return self.path - def format(self): - raise NotImplementedError("implemented by subclasses") + def _get_tree_root(self): + """Calculate tree root path from node path and depth""" + root = self.path + for _ in range(self.depth): + root = os.path.dirname(root) + return root -class DirectoryItem(_TreeItem): - def is_dataset(self): - try: - ds = require_dataset(self.path, check_installed=True) - return ds.id is not None - except (NoDatasetFound, AttributeError): - return False +class DirectoryNode(_TreeNode): + pass -class FileItem(_TreeItem): +class FileNode(_TreeNode): pass -class DatasetItem(_TreeItem): - def __init__(self, *args, abs_subds_depth=None, **kwargs): - # absolute subdataset depth: - # if None, it is not a dataset (or it is a . - # if 0, it is a top-level dataset. - self.abs_subds_depth = abs_subds_depth +class DirectoryOrDatasetNode(_TreeNode): + """ + Factory class for creating either a DirectoryNode or DatasetNode, + based on whether the current path is a dataset or not. + """ + def __new__(cls, path, *args, **kwargs): + if cls.is_dataset(path): + ds_node = DatasetNode(path, *args, **kwargs) + ds_node.calculate_dataset_depth() + return ds_node + else: + return DirectoryNode(path, *args, **kwargs) + + @staticmethod + def is_dataset(path): + """ + We infer that a directory is a dataset if it is either: + (A) installed, or + (B) not installed, but it has an installed superdatset. + """ + ds = require_dataset(path, check_installed=False) + superds = ds.get_superdataset(datalad_only=True, topmost=False, + registered_only=True) + return superds is not None + + +class DatasetNode(_TreeNode): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def _absolute_subdataset_depth(self, path): - """Subdataset level in the context of the full dataset - hierarchy. - For example, if building the tree starting from a direct - subdataset of a top-level parent dataset, will return depth 1 - for the subdataset, depth 2 for the sub-subdataset, etc.""" + self.ds = require_dataset(self.path, check_installed=False) + self.is_installed = self.ds.is_installed() + self._ds_depth = None + self._absolute_ds_depth = None - # TODO: check how recursion levels are handled e.g. in datalad status - pass + def calculate_dataset_depth(self): + """ + Calculate 2 measures of a dataset's nesting depth/level: + 1. subdataset depth relative to the tree root + 2. 
absolute subdataset depth in the full hierarchy + """ + self._ds_depth = 0 + self._absolute_ds_depth = 0 + + ds = self.ds + while ds: + superds = ds.get_superdataset( + datalad_only=True, topmost=False, registered_only=True) + + if superds is None: + # it is not a dataset, do nothing + break + else: + if superds == ds: + # it is a top-level dataset, we are done + break + self._absolute_ds_depth += 1 + if is_path_child_of_parent(superds.path, self._get_tree_root()): + # if the parent dataset is underneath the tree + # root, we increment the relative depth + self._ds_depth += 1 + + ds = superds From da3102303b80e50728c73847d53097a991b5fac0 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 6 Jul 2022 23:01:27 +0200 Subject: [PATCH 010/131] fix class name registered as command implementation class --- datalad_next/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/__init__.py b/datalad_next/__init__.py index 39710ed2..959019c8 100644 --- a/datalad_next/__init__.py +++ b/datalad_next/__init__.py @@ -32,7 +32,7 @@ # importable module that contains the command implementation 'datalad_next.tree', # name of the command class implementation in above module - 'Tree', + 'TreeCommand', ) ] ) From 24ace590bf38f2dfe679aff8093d9671fc638bcc Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 6 Jul 2022 23:50:23 +0200 Subject: [PATCH 011/131] customize print format for different node types --- datalad_next/tests/test_tree.py | 80 ++++++++++++++++----------------- datalad_next/tree.py | 36 ++++++++------- 2 files changed, 59 insertions(+), 57 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 97fbdca5..a9391a50 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -97,9 +97,9 @@ def format_param_ids(val): "include_hidden": False, "expected_stats_str": "3 directories, 0 datasets, 0 files", "expected_str": """ -├── dir0 -├── dir1 -└── dir2 +├── dir0/ +├── dir1/ +└── dir2/ """ }, { @@ -108,12 +108,12 @@ def format_param_ids(val): "include_hidden": False, "expected_stats_str": "6 directories, 0 datasets, 0 files", "expected_str": """ -├── dir0 -├── dir1 -└── dir2 - ├── dir2_dir0 - ├── dir2_dir1 - └── dir2_dir2 +├── dir0/ +├── dir1/ +└── dir2/ + ├── dir2_dir0/ + ├── dir2_dir1/ + └── dir2_dir2/ """ }, { @@ -124,9 +124,9 @@ def format_param_ids(val): "expected_str": """ ├── file0 ├── file1 -├── dir0 -├── dir1 -└── dir2 +├── dir0/ +├── dir1/ +└── dir2/ """ }, { @@ -137,16 +137,16 @@ def format_param_ids(val): "expected_str": """ ├── file0 ├── file1 -├── dir0 -├── dir1 +├── dir0/ +├── dir1/ | └── dir1_file0 -└── dir2 +└── dir2/ ├── dir2_file0 ├── dir2_file1 - ├── dir2_dir0 - ├── dir2_dir1 + ├── dir2_dir0/ + ├── dir2_dir1/ | └── dir2_dir1_file0 - └── dir2_dir2 + └── dir2_dir2/ ├── dir2_dir2_file0 └── dir2_dir2_file1 """ @@ -160,10 +160,10 @@ def format_param_ids(val): ├── .file2 ├── file0 ├── file1 -├── .dir3 -├── dir0 -├── dir1 -└── dir2 +├── .dir3/ +├── dir0/ +├── dir1/ +└── dir2/ """ }, { @@ -175,19 +175,19 @@ def format_param_ids(val): ├── .file2 ├── file0 ├── file1 -├── .dir3 +├── .dir3/ | ├── .dir3_file1 | └── dir3_file0 -├── dir0 -├── dir1 +├── dir0/ +├── dir1/ | └── dir1_file0 -└── dir2 +└── dir2/ ├── dir2_file0 ├── dir2_file1 - ├── dir2_dir0 - ├── dir2_dir1 + ├── dir2_dir0/ + ├── dir2_dir1/ | └── dir2_dir1_file0 - └── dir2_dir2 + └── dir2_dir2/ ├── dir2_dir2_file0 └── dir2_dir2_file1 """ @@ -198,10 +198,10 @@ def format_param_ids(val): "include_hidden": True, 
"expected_stats_str": "4 directories, 0 datasets, 0 files", "expected_str": """ -├── .dir3 -├── dir0 -├── dir1 -└── dir2 +├── .dir3/ +├── dir0/ +├── dir1/ +└── dir2/ """ }, { @@ -210,13 +210,13 @@ def format_param_ids(val): "include_hidden": True, "expected_stats_str": "7 directories, 0 datasets, 0 files", "expected_str": """ -├── .dir3 -├── dir0 -├── dir1 -└── dir2 - ├── dir2_dir0 - ├── dir2_dir1 - └── dir2_dir2 +├── .dir3/ +├── dir0/ +├── dir1/ +└── dir2/ + ├── dir2_dir0/ + ├── dir2_dir1/ + └── dir2_dir2/ """ }, ] diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 8cac9a98..ab414aef 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -331,15 +331,9 @@ def _build_string(self): # if value does not exist in set levels_with_exhausted_subtree.discard(node.depth) - path = node.path - indentation = "" # vertical continuation lines - branch_tip = "" # prefix symbol for the given node - + # build indentation string + indentation = "" if node.depth > 0: - # for non-root items, display the basename - path = os.path.basename(node.path) - - # build indentation string indentation_symbols_for_levels = [ " " if level in levels_with_exhausted_subtree @@ -348,13 +342,7 @@ def _build_string(self): ] indentation = "".join(indentation_symbols_for_levels) - # prepend prefix to path - if node.is_last_child: - branch_tip = "└── " - else: - branch_tip = "├── " - - self._string_repr += (indentation + branch_tip + path + "\n") + self._string_repr += (indentation + str(node) + "\n") return self._string_repr @@ -371,7 +359,12 @@ def __init__(self, path: str, depth: int, is_last_child): self.is_last_child = is_last_child # if it is last item of its subtree def __str__(self): - return self.path + path = os.path.basename(self.path) if self.depth > 0 else self.path + prefix = "" + if self.depth > 0: + prefix = "└── " if self.is_last_child else "├── " + + return prefix + path def _get_tree_root(self): """Calculate tree root path from node path and depth""" @@ -382,7 +375,11 @@ def _get_tree_root(self): class DirectoryNode(_TreeNode): - pass + def __str__(self): + string = super().__str__() + if self.depth > 0: + return string + "/" + return string class FileNode(_TreeNode): @@ -424,6 +421,11 @@ def __init__(self, *args, **kwargs): self._ds_depth = None self._absolute_ds_depth = None + def __str__(self): + installed_flag = " not_installed" if not self.is_installed else "" + suffix = f" [DS~{self._absolute_ds_depth}{installed_flag}]" + return super().__str__() + suffix + def calculate_dataset_depth(self): """ Calculate 2 measures of a dataset's nesting depth/level: From 599f85439ea666d1dd8c7413641d334150132b96 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 6 Jul 2022 23:51:27 +0200 Subject: [PATCH 012/131] fix command parameter names and examples --- datalad_next/tree.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index ab414aef..0db831b9 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -92,12 +92,12 @@ class TreeCommand(Interface): Defaults to the current directory.""", constraints=EnsureStr() | EnsureNone()), depth=Parameter( - args=("-L", "--directory-depth",), - doc="""maximum depth of directory tree to display""", + args=("-L", "--depth",), + doc="""maximum level of directory tree to display""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), dataset_depth=Parameter( args=("-R", "--dataset-depth",), - doc="""maximum depth of nested subdatasets to display""", + 
doc="""maximum level of nested subdatasets to display""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), datasets_only=Parameter( args=("--datasets-only",), @@ -115,15 +115,19 @@ class TreeCommand(Interface): _examples_ = [ dict( - text="Display up to 3 levels of subdirectories and their " - "contents starting from the current directory", + text="Display up to 3 levels of subdirectories and their contents " + "including files, starting from the current directory", code_py="tree(depth=3, include_files=True)", code_cmd="datalad tree -L 3 --include-files"), dict(text="List all first- and second-level subdatasets " - "of datasets located anywhere under /tmp, " + "of parent datasets located anywhere under /tmp, " "regardless of directory depth", - code_py="tree('/tmp', depth=2, depth_mode='dataset', datasets_only=Truec)", - code_cmd="datalad tree /tmp -L 2 --depth-mode dataset --datasets-only"), + code_py="tree('/tmp', dataset_depth=2, datasets_only=Truec)", + code_cmd="datalad tree /tmp -R 2 --datasets-only"), + dict(text="Display first- and second-level subdatasets and their" + "contents up to 3 directories deep (within each subdataset)", + code_py="tree('.', dataset_depth=2, directory_depth=1)", + code_cmd="datalad tree -R 2 -L 3"), ] @staticmethod From 689c3038bdd98f0183695c2f68d915f729660188 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 6 Jul 2022 23:51:59 +0200 Subject: [PATCH 013/131] clean up docstrings --- datalad_next/tree.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 0db831b9..e008fc0f 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -12,11 +12,11 @@ This command covers 2 main use cases: (1) Glorified `tree` command: - --- + ---- As a datalad user, I want to list the contents of a directory tree and see which directories are datalad datasets, so that I can locate my datasets in the context of the whole directory layout. - --- + ---- This is basically what is implemented by the `tree-datalad` utility -- just `tree` with visual markers for datasets. In addition to it, `datalad-tree` provides the following: @@ -151,7 +151,7 @@ def __call__(path='.', *, depth=1, dataset_depth=None, def increment_node_count(node_generator_func): """ - Decorator for incrementing the node count whenever a _TreeNode is yielded. + Decorator for incrementing the node count whenever a ``_TreeNode`` is yielded. """ @wraps(node_generator_func) def _wrapper(*args, **kwargs): @@ -177,24 +177,25 @@ def is_path_child_of_parent(child, parent): class Tree(object): """ - Does not store _TreeNode objects, only the string representation + Main class for building and serializing a directory tree. + Does not store ``_TreeNode`` objects, only the string representation of the whole tree and the statistics (counts of different node types). 
""" def __init__(self, root: str, max_depth: int, dataset_max_depth=None, datasets_only=False, include_files=False, include_hidden=False): + # TODO: validate parameters + # TODO: sanitize / normalize root path if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") - # TODO: sanitize / normalize root path self.root = root self.max_depth = max_depth self.dataset_max_depth = dataset_max_depth self.datasets_only = datasets_only self.include_files = include_files self.include_hidden = include_hidden - self._current_dataset_depth = -1 self._string_repr = "" # holds the serialized representation self._last_children = [] # TODO: stats should automatically register all concrete _TreeNode classes @@ -208,7 +209,7 @@ def __str__(self): return self._build_string() def _current_depth(self, path: str): - """Directory depth of current path relative to root of the walk""" + """Directory depth of current path relative to root of the tree""" # directory depth can be safely inferred from the number of # path separators in path, since pathsep characters are illegal # in file or directory names. @@ -234,7 +235,8 @@ def _is_max_dataset_depth_reached(self, path): def stats(self): """ - Equivalent of tree command's 'report line'. + Equivalent of tree command's 'report line' at the end of the + tree output. """ return f"{self._stats['DirectoryNode']} directories, " \ f"{self._stats['DatasetNode']} datasets, " \ @@ -284,7 +286,7 @@ def _generate_nodes(self): # handle directories/datasets dir_or_ds = DirectoryOrDatasetNode(path, current_depth, - self._is_last_child(path)) + self._is_last_child(path)) if not self.datasets_only or \ self.datasets_only and isinstance(dir_or_ds, DatasetNode): yield dir_or_ds @@ -313,15 +315,15 @@ def _build_string(self): Return tree as string, where each line represents a node (directory or dataset or file). Each line follows the structure: - `[] [] ` - Example: - | | ├── path_dir_level3 + ``[] [] `` + Example line: + ``| | ├── path_dir_level3`` """ - # keep track of levels where subtree is exhaused, - # i.e. we have reached the last child of the subtree. - # this is needed to build the indentation string for each item, - # which takes into account whether any parent + # keep track of levels where subtree is exhaused, i.e. + # we have reached the last child of the subtree. + # this is needed to build the indentation string for each + # node, which takes into account whether any parent # is the last node of its own subtree. levels_with_exhausted_subtree = set([]) @@ -392,7 +394,7 @@ class FileNode(_TreeNode): class DirectoryOrDatasetNode(_TreeNode): """ - Factory class for creating either a DirectoryNode or DatasetNode, + Factory class for creating either a ``DirectoryNode`` or ``DatasetNode``, based on whether the current path is a dataset or not. 
""" def __new__(cls, path, *args, **kwargs): From de259a48637e3f016370fe448c882ba7d466588a Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 7 Jul 2022 19:47:04 +0200 Subject: [PATCH 014/131] add methods for yielding string output lines --- datalad_next/tests/test_tree.py | 38 ++++++------ datalad_next/tree.py | 103 +++++++++++++++++++++----------- 2 files changed, 89 insertions(+), 52 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index a9391a50..b3c806cd 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,5 +1,5 @@ import os -from random import random +from datetime import datetime import pytest from datalad.tests.utils_pytest import ( @@ -48,28 +48,28 @@ def path(): dir_tree = { "root": { ".dir3": { - "dir3_file0": 'tempfile', - ".dir3_file1": 'tempfile', + "dir3_file0": '', + ".dir3_file1": '', }, "dir0": {}, # empty dir "dir1": { - "dir1_file0": 'tempfile', + "dir1_file0": '', }, "dir2": { "dir2_dir0": {}, "dir2_dir1": { - "dir2_dir1_file0": 'tempfile', + "dir2_dir1_file0": '', }, "dir2_dir2": { - "dir2_dir2_file0": 'tempfile', - "dir2_dir2_file1": 'tempfile', + "dir2_dir2_file0": '', + "dir2_dir2_file1": '', }, - "dir2_file0": 'tempfile', - "dir2_file1": 'tempfile', + "dir2_file0": '', + "dir2_file1": '', }, - ".file2": 'tempfile', - "file0": 'tempfile', - "file1": 'tempfile', + ".file2": '', + "file0": '', + "file1": '', } } @@ -239,19 +239,23 @@ def build_param_matrix(param_names): def test_print_tree_with_params( path, depth, include_files, include_hidden, expected_str ): - root = os.path.join(path, 'root') + root = os.path.join(path, "root") tree = Tree( root, max_depth=depth, include_files=include_files, include_hidden=include_hidden) - actual_res = str(tree) - expected_res = root + expected_str + # skip the first line with the root directory + # as we will test it separately + lines = (l for i, l in enumerate(tree.print_line()) if i > 0) + actual_res = "\n".join(lines) + "\n" + expected_res = expected_str.lstrip("\n") # strip first newline assert_str_equal(expected_res, actual_res) def test_print_tree_for_nonexistent_directory(): """Obtain nonexistent directory by creating a temp dir and deleting it (may be safest method)""" - nonexistent_dir = with_tree({"to_be_deleted": []})(lambda f: f)() + dir_name = f"to_be_deleted_{datetime.now().timestamp()}" + nonexistent_dir = with_tree({dir_name: []})(lambda f: f)() with assert_raises(ValueError): Tree(nonexistent_dir, max_depth=1) @@ -268,5 +272,5 @@ def test_tree_stats( root, max_depth=depth, include_files=include_files, include_hidden=include_hidden).build() actual_res = tree.stats() - expected_res = expected_stats_str + "\n" + expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index e008fc0f..5aa35d60 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -137,9 +137,15 @@ def __call__(path='.', *, depth=1, dataset_depth=None, datasets_only=False, include_files=False, include_hidden=False): # print tree output - tree = Tree(path, depth, datasets_only=datasets_only, include_files=include_files) - print(tree) - print(tree.stats()) + tree = Tree( + path, depth, dataset_max_depth=dataset_depth, + datasets_only=datasets_only, + include_files=include_files, include_hidden=include_hidden) + + for line in tree.print_line(): + # print one line at a time to improve perceived speed + print(line) + print("\n" + tree.stats() + "\n") # return a generic OK status 
yield get_status_dict( @@ -187,34 +193,25 @@ def __init__(self, root: str, max_depth: int, dataset_max_depth=None, include_hidden=False): # TODO: validate parameters - # TODO: sanitize / normalize root path if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") - self.root = root + self.root = os.path.normpath(root) self.max_depth = max_depth self.dataset_max_depth = dataset_max_depth self.datasets_only = datasets_only self.include_files = include_files self.include_hidden = include_hidden - self._string_repr = "" # holds the serialized representation + self._lines = [] # holds the list of lines of output string self._last_children = [] # TODO: stats should automatically register all concrete _TreeNode classes self._stats = {"DirectoryNode": 0, "DatasetNode": 0, "FileNode": 0} - def __str__(self): - """Return serialized Tree representation""" - if self._string_repr != "": - # tree has already been serialized - return self._string_repr - return self._build_string() - def _current_depth(self, path: str): """Directory depth of current path relative to root of the tree""" # directory depth can be safely inferred from the number of # path separators in path, since pathsep characters are illegal # in file or directory names. - # TODO: sanitize / normalize root path in Tree constructor - return path.count(os.path.sep) - self.root.rstrip(os.path.sep).count(os.path.sep) + return path.count(os.path.sep) - self.root.count(os.path.sep) def _is_last_child(self, path): """Whether an item is the last child within its subtree""" @@ -237,16 +234,21 @@ def stats(self): """ Equivalent of tree command's 'report line' at the end of the tree output. + Only counts contents below the root directory, does not count + the root itself. """ return f"{self._stats['DirectoryNode']} directories, " \ f"{self._stats['DatasetNode']} datasets, " \ - f"{self._stats['FileNode']} files" \ - "\n" + f"{self._stats['FileNode']} files" + + def _total_nodes(self): + return sum(c for c in self._stats.values()) def build(self): - """Public API for constructing tree. - Returns the instance.""" - self._build_string() + """ + Construct the tree string representation and return back the instance. 
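+
+        Hypothetical chained usage (illustration only):
+
+            tree = Tree('/tmp/mydir', max_depth=1).build()
+            print(tree.to_string())
+            print(tree.stats())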
+        """
+        self.to_string()
         return self
 
     @increment_node_count
     def _generate_nodes(self):
@@ -287,7 +289,8 @@ def _generate_nodes(self):
             # handle directories/datasets
             dir_or_ds = DirectoryOrDatasetNode(path, current_depth,
                                                self._is_last_child(path))
-            if not self.datasets_only or \
+            if current_depth == 0 or \
+                    not self.datasets_only or \
                     self.datasets_only and isinstance(dir_or_ds, DatasetNode):
                 yield dir_or_ds
 
@@ -299,22 +302,48 @@ def _generate_nodes(self):
                     self._is_last_child(file_path))
 
             if self._is_max_depth_reached(path):
-                # generate any remaining directory items, which
-                # will not be traversed
+                # generate any remaining directory/dataset nodes,
+                # which will not be traversed in the next iteration
                 for child_dir in dirs:
                     dir_path = os.path.join(path, child_dir)
-                    yield DirectoryNode(dir_path, current_depth + 1,
-                                        self._is_last_child(dir_path))
+
+                    dir_or_ds = DirectoryOrDatasetNode(
+                        dir_path, current_depth + 1,
+                        self._is_last_child(dir_path))
+
+                    if not self.datasets_only or \
+                            self.datasets_only and isinstance(dir_or_ds,
+                                                              DatasetNode):
+                        yield dir_or_ds
 
                 # empty in-place the list of next directories to
                 # traverse, which effectively stops os.walk's walking
                 dirs[:] = []
 
-    def _build_string(self):
+    def to_string(self):
+        """Return complete tree as string"""
+        if not self._lines:
+            return "\n".join(list(self.print_line()))
+        return self._lines
+
+    def print_line(self):
+        """Generator for tree output lines"""
+        if not self._lines:
+            # string output has not been generated yet
+            for line in self._yield_lines():
+                self._lines.append(line)
+                yield line
+        else:
+            # string output is already generated
+            for line in self._lines:
+                yield line
+            yield "\n"  # newline at the very end
+
+    def _yield_lines(self):
         """
-        Return tree as string, where each line represents a node
-        (directory or dataset or file).
-        Each line follows the structure:
+        Generator of lines of the tree string representation.
+        Each line represents a node (directory or dataset or file).
+        A line follows the structure:
         ``[<indentation>] [<branch_tip>] <path>``
         Example line:
         ``| | ├── path_dir_level3``
         """
 
     @staticmethod
     def is_dataset(path):
         """
         We infer that a directory is a dataset if it is either:
         (A) installed, or
         (B) not installed, but it has an installed superdataset.
""" ds = require_dataset(path, check_installed=False) + if ds.is_installed(): + return True + + # check if it has an installed superdataset superds = ds.get_superdataset(datalad_only=True, topmost=False, registered_only=True) return superds is not None -class DatasetNode(_TreeNode): +class DatasetNode(DirectoryNode): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -428,8 +460,8 @@ def __init__(self, *args, **kwargs): self._absolute_ds_depth = None def __str__(self): - installed_flag = " not_installed" if not self.is_installed else "" - suffix = f" [DS~{self._absolute_ds_depth}{installed_flag}]" + install_flag = ", not installed" if not self.is_installed else "" + suffix = f" [DS~{self._absolute_ds_depth}{install_flag}]" return super().__str__() + suffix def calculate_dataset_depth(self): @@ -454,6 +486,7 @@ def calculate_dataset_depth(self): if superds == ds: # it is a top-level dataset, we are done break + self._absolute_ds_depth += 1 if is_path_child_of_parent(superds.path, self._get_tree_root()): # if the parent dataset is underneath the tree From 5c9ca960cba21ba35194650c24f39d75e757e6bc Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 7 Jul 2022 19:57:18 +0200 Subject: [PATCH 015/131] add test for normalization of root path --- datalad_next/tests/test_tree.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index b3c806cd..1e6fee49 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -251,6 +251,22 @@ def test_print_tree_with_params( assert_str_equal(expected_res, actual_res) +@pytest.mark.parametrize("root_dir_name", [ + "root/", "root/.", "root/./", "root/../root" +]) +def test_root_path_is_normalized(path, root_dir_name): + """ + Test that root path in the first line of string output + is normalized path + """ + root = os.path.join(path, root_dir_name) + tree = Tree(root, max_depth=0) + root_path = next(tree.print_line()) # first line of tree output + expected = os.path.join(path, "root") + actual = root_path + assert_str_equal(expected, actual) + + def test_print_tree_for_nonexistent_directory(): """Obtain nonexistent directory by creating a temp dir and deleting it (may be safest method)""" From 4a84ee697dcf455d2159198b191e26d7cb3d6d32 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 7 Jul 2022 23:47:58 +0200 Subject: [PATCH 016/131] fix incorrect string creation in to_string() --- datalad_next/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 5aa35d60..bac91925 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -324,7 +324,7 @@ def to_string(self): """Return complete tree as string""" if not self._lines: return "\n".join(list(self.print_line())) - return self._lines + return "\n".join(self._lines) def print_line(self): """Generator for tree output lines""" From 2518d635cde3bedcea123c221eeb30d36135497f Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 7 Jul 2022 23:48:28 +0200 Subject: [PATCH 017/131] add tests for trees with datasets --- datalad_next/tests/test_tree.py | 182 ++++++++++++++++++++++++++++---- 1 file changed, 159 insertions(+), 23 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 1e6fee49..8207050f 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,14 +1,13 @@ import os +from os.path import join as opj from datetime import 
datetime import pytest +from datalad.distribution.dataset import Dataset from datalad.tests.utils_pytest import ( - assert_in, - assert_not_in, assert_raises, - eq_, - neq_, - with_tree, assert_str_equal + assert_str_equal, + with_tree ) from datalad.utils import rmtemp @@ -79,6 +78,56 @@ def path(): assert not os.path.exists(temp_dir_root) +@pytest.fixture(scope="module") +def path_ds(): + """ + Fixture for temporary directory tree including nested + directories and datasets + """ + ds_tree = { + "root": { + "superds0": { + "sd0_file0": "", + "sd0_subds0": { + "sd0_sub0_subds0": {} + } + }, + "superds1": { + "sd1_file0": "", + "sd1_dir0": { + "sd1_d0_dir0": {}, + "sd1_d0_subds0": {}, + }, + "sd1_ds0": {}, # not registered as subdataset + "sd1_subds0": {}, # not installed (drop all) + }, + "dir0": { + "d0_file0": "" + }, + "file0": "", + } + } + + temp_dir_root = create_temp_dir_tree(ds_tree) + + # create datasets + root = opj(temp_dir_root, "root") + superds0 = Dataset(opj(root, "superds0")).create(force=True) + sd0_subds0 = superds0.create("sd0_subds0", force=True) + sd0_subds0.create("sd0_sub0_subds0", force=True) + superds1 = Dataset(opj(root, "superds1")).create(force=True) + superds1.create(opj("sd1_dir0", "sd1_d0_subds0"), force=True) + Dataset(opj(root, "superds1", "sd1_ds0")).create(force=True) + sd1_subds0 = superds1.create("sd1_subds0", force=True) + sd1_subds0.drop(what='all', reckless='kill', recursive=True) + + yield temp_dir_root + + # delete temp dir + rmtemp(temp_dir_root) + assert not os.path.exists(temp_dir_root) + + def format_param_ids(val): """Helper to format pytest parameter IDs. If the parameter is a multiline string, we assume it is the @@ -88,9 +137,9 @@ def format_param_ids(val): return "expected" -# Combinations of parameters to be tested and their expected results. +# combinations of parameters to be tested and their expected results. 
# (2 levels per param) ** (3 params) = 8 combinations + 8 expected results -param_combinations = [ +matrix_no_ds = [ { "depth": 1, "include_files": False, @@ -221,22 +270,80 @@ def format_param_ids(val): }, ] +# for trees with datasets, we test the dataset-specific options +matrix_ds = [ + { + "depth": 1, + "datasets_only": False, + "expected_stats_str": "", + "expected_str": """ +├── dir0/ +├── superds0/ [DS~0] +└── superds1/ [DS~0] +""", + }, + { + "depth": 4, + "datasets_only": False, + "expected_stats_str": "", + "expected_str": """ +├── dir0/ +├── superds0/ [DS~0] +| └── sd0_subds0/ [DS~1] +| └── sd0_sub0_subds0/ [DS~2] +└── superds1/ [DS~0] + ├── sd1_dir0/ + | ├── sd1_d0_dir0/ + | └── sd1_d0_subds0/ [DS~1] + ├── sd1_ds0/ [DS~0] + └── sd1_subds0/ [DS~1, not installed] +""", + }, + { + "depth": 1, + "datasets_only": True, + "expected_stats_str": "", + "expected_str": """ +├── superds0/ [DS~0] +└── superds1/ [DS~0] +""", + }, + { + "depth": 4, + "datasets_only": True, + "expected_stats_str": "", + "expected_str": """ +├── superds0/ [DS~0] +| └── sd0_subds0/ [DS~1] +| └── sd0_sub0_subds0/ [DS~2] +└── superds1/ [DS~0] + └── sd1_d0_subds0/ [DS~1] + ├── sd1_ds0/ [DS~0] + └── sd1_subds0/ [DS~1, not installed] +""", + }, +] + -def build_param_matrix(param_names): - matrix = [] - for combination in param_combinations: - matrix.append( +def build_param_matrix(matrix, params): + """Turn inner dicts into lists (required by pytest parametrize)""" + matrix_out = [] + for combination in matrix: + matrix_out.append( # order of combinations does not matter - [val for key, val in combination.items() if key in param_names] + [val for key, val in combination.items() if key in params] ) - return matrix + return matrix_out + + +param_names = ["depth", "include_files", "include_hidden", "expected_str"] @pytest.mark.parametrize( - ["depth", "include_files", "include_hidden", "expected_str"], - build_param_matrix(["depth", "include_files", "include_hidden", "expected_str"]), ids=format_param_ids + param_names, build_param_matrix(matrix_no_ds, param_names), + ids=format_param_ids ) -def test_print_tree_with_params( +def test_print_tree_with_params_no_ds( path, depth, include_files, include_hidden, expected_str ): root = os.path.join(path, "root") @@ -251,9 +358,9 @@ def test_print_tree_with_params( assert_str_equal(expected_res, actual_res) -@pytest.mark.parametrize("root_dir_name", [ - "root/", "root/.", "root/./", "root/../root" -]) +@pytest.mark.parametrize( + "root_dir_name", ["root/", "root/.", "root/./", "root/../root"] +) def test_root_path_is_normalized(path, root_dir_name): """ Test that root path in the first line of string output @@ -267,7 +374,7 @@ def test_root_path_is_normalized(path, root_dir_name): assert_str_equal(expected, actual) -def test_print_tree_for_nonexistent_directory(): +def test_print_tree_fails_for_nonexistent_directory(): """Obtain nonexistent directory by creating a temp dir and deleting it (may be safest method)""" dir_name = f"to_be_deleted_{datetime.now().timestamp()}" @@ -276,11 +383,12 @@ def test_print_tree_for_nonexistent_directory(): Tree(nonexistent_dir, max_depth=1) +param_names = ["depth", "include_files", "include_hidden", "expected_stats_str"] + @pytest.mark.parametrize( - ["depth", "include_files", "include_hidden", "expected_stats_str"], - build_param_matrix(["depth", "include_files", "include_hidden", "expected_stats_str"]) + param_names, build_param_matrix(matrix_no_ds, param_names) ) -def test_tree_stats( +def test_print_stats( path, depth, include_files, 
include_hidden, expected_stats_str ): root = os.path.join(path, 'root') @@ -290,3 +398,31 @@ def test_tree_stats( actual_res = tree.stats() expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) + + +def test_tree_to_string(path): + root = os.path.join(path, 'root') + tree = Tree(root, 3) + actual = tree.to_string() + expected = "\n".join(tree._lines) + assert_str_equal(expected, actual) + + +param_names = ["depth", "datasets_only", "expected_str"] + + +@pytest.mark.parametrize( + param_names, build_param_matrix(matrix_ds, param_names), + ids=format_param_ids +) +def test_print_tree_with_params_with_ds( + path_ds, depth, datasets_only, expected_str +): + root = os.path.join(path_ds, "root") + tree = Tree(root, max_depth=depth, datasets_only=datasets_only) + # skip the first line with the root directory + # as we will test it separately + lines = (l for i, l in enumerate(tree.print_line()) if i > 0) + actual_res = "\n".join(lines) + "\n" + expected_res = expected_str.lstrip("\n") # strip first newline + assert_str_equal(expected_res, actual_res) From 17d6091f7c7634f5c7e6bbe9bb2a63f7737dc22c Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 8 Jul 2022 00:38:44 +0200 Subject: [PATCH 018/131] reinstate --full-paths param (useful in combination with --datasets-only) --- datalad_next/tests/test_tree.py | 22 ++++++++--- datalad_next/tree.py | 65 ++++++++++++++++++++++++--------- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 8207050f..7b22c83c 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -7,7 +7,7 @@ from datalad.tests.utils_pytest import ( assert_raises, assert_str_equal, - with_tree + with_tree, assert_re_in ) from datalad.utils import rmtemp @@ -352,8 +352,9 @@ def test_print_tree_with_params_no_ds( include_files=include_files, include_hidden=include_hidden) # skip the first line with the root directory # as we will test it separately - lines = (l for i, l in enumerate(tree.print_line()) if i > 0) - actual_res = "\n".join(lines) + "\n" + lines = tree.print_line() + next(lines) # skip the first line (root dir) + actual_res = "\n".join(l for l in lines) + "\n" expected_res = expected_str.lstrip("\n") # strip first newline assert_str_equal(expected_res, actual_res) @@ -422,7 +423,18 @@ def test_print_tree_with_params_with_ds( tree = Tree(root, max_depth=depth, datasets_only=datasets_only) # skip the first line with the root directory # as we will test it separately - lines = (l for i, l in enumerate(tree.print_line()) if i > 0) - actual_res = "\n".join(lines) + "\n" + lines = tree.print_line() + next(lines) # skip the first line (root dir) + actual_res = "\n".join(l for l in lines) + "\n" expected_res = expected_str.lstrip("\n") # strip first newline assert_str_equal(expected_res, actual_res) + + +def test_print_tree_full_paths(): + # run in the cwd so detecting full paths is easier + tree = Tree('.', max_depth=1, full_paths=True) + # get the second line (first child, hopefully exists) + lines = tree.print_line() + next(lines) # skip the first line (root dir) + first_child = next(lines) + assert_re_in(r"(?:└──|├──) \./", first_child) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index bac91925..b3efe29a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -101,15 +101,19 @@ class TreeCommand(Interface): constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), datasets_only=Parameter( args=("--datasets-only",), 
- doc="""whether to only list directories that are datasets""", + doc="""only list directories that are datasets""", action='store_true'), include_files=Parameter( args=("--include-files",), - doc="""whether to include files in output display""", + doc="""include files in output display""", action='store_true'), include_hidden=Parameter( args=("-a", "--include-hidden",), - doc="""whether to include hidden files/directories in output display""", + doc="""include hidden files/directories in output""", + action='store_true'), + full_paths=Parameter( + args=("--full-paths",), + doc="""display the full path for files/directories""", action='store_true'), ) @@ -122,8 +126,8 @@ class TreeCommand(Interface): dict(text="List all first- and second-level subdatasets " "of parent datasets located anywhere under /tmp, " "regardless of directory depth", - code_py="tree('/tmp', dataset_depth=2, datasets_only=Truec)", - code_cmd="datalad tree /tmp -R 2 --datasets-only"), + code_py="tree('/tmp', dataset_depth=2, datasets_only=True, full_paths=True)", + code_cmd="datalad tree /tmp -R 2 --datasets-only --full-paths"), dict(text="Display first- and second-level subdatasets and their" "contents up to 3 directories deep (within each subdataset)", code_py="tree('.', dataset_depth=2, directory_depth=1)", @@ -133,14 +137,26 @@ class TreeCommand(Interface): @staticmethod @datasetmethod(name='tree') @eval_results - def __call__(path='.', *, depth=1, dataset_depth=None, - datasets_only=False, include_files=False, include_hidden=False): - + def __call__( + path='.', + *, + depth=1, + dataset_depth=None, + datasets_only=False, + include_files=False, + include_hidden=False, + full_paths=False, + ): # print tree output tree = Tree( - path, depth, dataset_max_depth=dataset_depth, + path, + depth, + dataset_max_depth=dataset_depth, datasets_only=datasets_only, - include_files=include_files, include_hidden=include_hidden) + include_files=include_files, + include_hidden=include_hidden, + full_paths=full_paths + ) for line in tree.print_line(): # print one line at a time to improve perceived speed @@ -190,7 +206,7 @@ class Tree(object): def __init__(self, root: str, max_depth: int, dataset_max_depth=None, datasets_only=False, include_files=False, - include_hidden=False): + include_hidden=False, full_paths=False): # TODO: validate parameters if not os.path.isdir(root): @@ -201,6 +217,7 @@ def __init__(self, root: str, max_depth: int, dataset_max_depth=None, self.datasets_only = datasets_only self.include_files = include_files self.include_hidden = include_hidden + self.full_paths = full_paths self._lines = [] # holds the list of lines of output string self._last_children = [] # TODO: stats should automatically register all concrete _TreeNode classes @@ -234,6 +251,8 @@ def stats(self): """ Equivalent of tree command's 'report line' at the end of the tree output. + The 3 node types (directory, dataset, file) are mutually exclusive, + so their total is the total count of nodes. Only counts contents below the root directory, does not count the root itself. 
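+        Example report line (illustrative counts):
+            3 directories, 2 datasets, 5 files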
""" @@ -287,8 +306,9 @@ def _generate_nodes(self): current_depth = self._current_depth(path) # handle directories/datasets - dir_or_ds = DirectoryOrDatasetNode(path, current_depth, - self._is_last_child(path)) + dir_or_ds = DirectoryOrDatasetNode( + path, current_depth, self._is_last_child(path), self.full_paths + ) if current_depth == 0 or \ not self.datasets_only or \ self.datasets_only and isinstance(dir_or_ds, DatasetNode): @@ -298,8 +318,10 @@ def _generate_nodes(self): if self.include_files: for file in files: file_path = os.path.join(path, file) - yield FileNode(file_path, current_depth + 1, - self._is_last_child(file_path)) + yield FileNode( + file_path, current_depth + 1, + self._is_last_child(file_path), self.full_paths + ) if self._is_max_depth_reached(path): # generate any remaining directory/dataset nodes, @@ -309,7 +331,8 @@ def _generate_nodes(self): dir_or_ds = DirectoryOrDatasetNode( dir_path, current_depth + 1, - self._is_last_child(dir_path)) + self._is_last_child(dir_path), self.full_paths + ) if not self.datasets_only or \ self.datasets_only and isinstance(dir_or_ds, @@ -387,13 +410,19 @@ class _TreeNode(object): tree node and printed as single line of the 'tree' output. """ - def __init__(self, path: str, depth: int, is_last_child): + def __init__(self, path: str, depth: int, is_last_child: bool, + use_full_paths=False): self.path = path self.depth = depth # depth in the directory tree self.is_last_child = is_last_child # if it is last item of its subtree + self.use_full_paths = use_full_paths def __str__(self): - path = os.path.basename(self.path) if self.depth > 0 else self.path + if self.depth == 0 or self.use_full_paths: + path = self.path + else: + path = os.path.basename(self.path) + prefix = "" if self.depth > 0: prefix = "└── " if self.is_last_child else "├── " From deb06e3a19340cec78bd49d800211cebfd7bc5b5 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 8 Jul 2022 00:54:52 +0200 Subject: [PATCH 019/131] support color terminal output for directories and dataset paths --- datalad_next/tree.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index b3efe29a..379b3abd 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -70,6 +70,7 @@ EnsureNone, EnsureStr, EnsureInt, EnsureBool, EnsureRange, Constraints, ) +from datalad.support import ansi_colors lgr = logging.getLogger('datalad.local.tree') @@ -409,6 +410,7 @@ class _TreeNode(object): Base class for a directory or file represented as a single tree node and printed as single line of the 'tree' output. 
""" + COLOR = None # ANSI color for the path, if terminal color are enabled def __init__(self, path: str, depth: int, is_last_child: bool, use_full_paths=False): @@ -423,6 +425,9 @@ def __str__(self): else: path = os.path.basename(self.path) + if self.COLOR is not None: + path = ansi_colors.color_word(path, self.COLOR) + prefix = "" if self.depth > 0: prefix = "└── " if self.is_last_child else "├── " @@ -438,6 +443,8 @@ def _get_tree_root(self): class DirectoryNode(_TreeNode): + COLOR = ansi_colors.BLUE + def __str__(self): string = super().__str__() if self.depth > 0: @@ -480,6 +487,8 @@ def is_dataset(path): class DatasetNode(DirectoryNode): + COLOR = ansi_colors.MAGENTA + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 732660385849ff15b8f03a6f2b374ccb8c4c8a74 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 8 Jul 2022 01:16:54 +0200 Subject: [PATCH 020/131] allow --depth=0 (useful in combination with --dataset-depth) --- datalad_next/tests/test_tree.py | 9 +++++++++ datalad_next/tree.py | 21 ++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 7b22c83c..a3354723 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -438,3 +438,12 @@ def test_print_tree_full_paths(): next(lines) # skip the first line (root dir) first_child = next(lines) assert_re_in(r"(?:└──|├──) \./", first_child) + + +def test_print_tree_depth_zero(path): + root = os.path.join(path, "root") + tree = Tree(root, max_depth=0, + include_files=True) # should have no effect + actual = tree.to_string() + expected = root + assert_str_equal(expected, actual) \ No newline at end of file diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 379b3abd..92d87c14 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -94,7 +94,8 @@ class TreeCommand(Interface): constraints=EnsureStr() | EnsureNone()), depth=Parameter( args=("-L", "--depth",), - doc="""maximum level of directory tree to display""", + doc="""maximum level of directory tree to display. + If not specified, will display all levels.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), dataset_depth=Parameter( args=("-R", "--dataset-depth",), @@ -141,7 +142,7 @@ class TreeCommand(Interface): def __call__( path='.', *, - depth=1, + depth=None, dataset_depth=None, datasets_only=False, include_files=False, @@ -151,7 +152,7 @@ def __call__( # print tree output tree = Tree( path, - depth, + max_depth=depth, dataset_max_depth=dataset_depth, datasets_only=datasets_only, include_files=include_files, @@ -205,7 +206,7 @@ class Tree(object): of the whole tree and the statistics (counts of different node types). 
""" - def __init__(self, root: str, max_depth: int, dataset_max_depth=None, + def __init__(self, root: str, max_depth=None, dataset_max_depth=None, datasets_only=False, include_files=False, include_hidden=False, full_paths=False): @@ -213,12 +214,17 @@ def __init__(self, root: str, max_depth: int, dataset_max_depth=None, if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") self.root = os.path.normpath(root) + self.max_depth = max_depth + if max_depth is not None and max_depth < 0: + raise ValueError("max_depth must be >= 0") + self.dataset_max_depth = dataset_max_depth self.datasets_only = datasets_only self.include_files = include_files self.include_hidden = include_hidden self.full_paths = full_paths + self._lines = [] # holds the list of lines of output string self._last_children = [] # TODO: stats should automatically register all concrete _TreeNode classes @@ -243,7 +249,9 @@ def _is_max_depth_reached(self, path): right below the current level. Therefore, we 'reach' when we get to 1 level *before* max_depth. """ - return self._current_depth(path) == self.max_depth - 1 + if self.max_depth is not None: + return self._current_depth(path) == self.max_depth - 1 + return False # unlimited depth def _is_max_dataset_depth_reached(self, path): pass @@ -315,6 +323,9 @@ def _generate_nodes(self): self.datasets_only and isinstance(dir_or_ds, DatasetNode): yield dir_or_ds + if self.max_depth == 0: + break # just yield the root dir and exit + # handle files if self.include_files: for file in files: From 888dfe5aa6525571c1a429b19ccb2ef15364e8d1 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 8 Jul 2022 01:24:03 +0200 Subject: [PATCH 021/131] clean up comment, removed unused imports --- datalad_next/tree.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 92d87c14..391c32d3 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -42,7 +42,6 @@ __docformat__ = 'restructuredtext' -import json import logging import os from functools import wraps @@ -51,11 +50,9 @@ Interface, build_doc, ) -from datalad.support.exceptions import CapturedException, NoDatasetFound from datalad.support.param import Parameter from datalad.distribution.dataset import ( datasetmethod, - EnsureDataset, require_dataset, ) from datalad.interface.results import ( @@ -63,12 +60,10 @@ ) from datalad.interface.utils import ( eval_results, - generic_result_renderer, ) from datalad.support.constraints import ( - EnsureChoice, EnsureNone, - EnsureStr, EnsureInt, EnsureBool, EnsureRange, Constraints, + EnsureStr, EnsureInt, EnsureRange, ) from datalad.support import ansi_colors @@ -149,7 +144,6 @@ def __call__( include_hidden=False, full_paths=False, ): - # print tree output tree = Tree( path, max_depth=depth, @@ -161,7 +155,7 @@ def __call__( ) for line in tree.print_line(): - # print one line at a time to improve perceived speed + # print one line at a time to improve UX / perceived speed print(line) print("\n" + tree.stats() + "\n") @@ -175,7 +169,8 @@ def __call__( def increment_node_count(node_generator_func): """ - Decorator for incrementing the node count whenever a ``_TreeNode`` is yielded. + Decorator for incrementing the node count whenever + a ``_TreeNode`` is generated. 
""" @wraps(node_generator_func) def _wrapper(*args, **kwargs): @@ -183,7 +178,9 @@ def _wrapper(*args, **kwargs): for node in node_generator_func(*args, **kwargs): node_type = node.__class__.__name__ if node_type not in self._stats: - raise ValueError(f"No stats collected for unknown node type '{node_type}'") + raise ValueError( + f"No stats collected for unknown node type '{node_type}'" + ) if node.depth > 0: # we do not count the root directory self._stats[node_type] += 1 @@ -210,7 +207,6 @@ def __init__(self, root: str, max_depth=None, dataset_max_depth=None, datasets_only=False, include_files=False, include_hidden=False, full_paths=False): - # TODO: validate parameters if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") self.root = os.path.normpath(root) @@ -225,8 +221,8 @@ def __init__(self, root: str, max_depth=None, dataset_max_depth=None, self.include_hidden = include_hidden self.full_paths = full_paths - self._lines = [] # holds the list of lines of output string - self._last_children = [] + self._lines = [] # list of lines of output string + self._last_children = [] # last child of each subtree # TODO: stats should automatically register all concrete _TreeNode classes self._stats = {"DirectoryNode": 0, "DatasetNode": 0, "FileNode": 0} From f2eb743fcd2768eb50b7beebfee2bb9e0aecfe11 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 8 Jul 2022 10:58:39 +0200 Subject: [PATCH 022/131] add tests for stats with datasets --- datalad_next/tests/test_tree.py | 61 +++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index a3354723..5ae91af3 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -39,7 +39,7 @@ def create_temp_dir_tree(tree_dict): @pytest.fixture(scope="module") -def path(): +def path_no_ds(): """ Fixture for temporary directory tree including nested directories, without datasets @@ -275,7 +275,7 @@ def format_param_ids(val): { "depth": 1, "datasets_only": False, - "expected_stats_str": "", + "expected_stats_str": "1 directories, 2 datasets, 0 files", "expected_str": """ ├── dir0/ ├── superds0/ [DS~0] @@ -285,7 +285,7 @@ def format_param_ids(val): { "depth": 4, "datasets_only": False, - "expected_stats_str": "", + "expected_stats_str": "3 directories, 7 datasets, 0 files", "expected_str": """ ├── dir0/ ├── superds0/ [DS~0] @@ -302,7 +302,7 @@ def format_param_ids(val): { "depth": 1, "datasets_only": True, - "expected_stats_str": "", + "expected_stats_str": "0 directories, 2 datasets, 0 files", "expected_str": """ ├── superds0/ [DS~0] └── superds1/ [DS~0] @@ -311,7 +311,7 @@ def format_param_ids(val): { "depth": 4, "datasets_only": True, - "expected_stats_str": "", + "expected_stats_str": "0 directories, 7 datasets, 0 files", "expected_str": """ ├── superds0/ [DS~0] | └── sd0_subds0/ [DS~1] @@ -336,6 +336,8 @@ def build_param_matrix(matrix, params): return matrix_out +# ================== Test directory tree without datasets ================== + param_names = ["depth", "include_files", "include_hidden", "expected_str"] @@ -344,9 +346,9 @@ def build_param_matrix(matrix, params): ids=format_param_ids ) def test_print_tree_with_params_no_ds( - path, depth, include_files, include_hidden, expected_str + path_no_ds, depth, include_files, include_hidden, expected_str ): - root = os.path.join(path, "root") + root = os.path.join(path_no_ds, "root") tree = Tree( root, max_depth=depth, include_files=include_files, 
include_hidden=include_hidden) @@ -362,15 +364,15 @@ def test_print_tree_with_params_no_ds( @pytest.mark.parametrize( "root_dir_name", ["root/", "root/.", "root/./", "root/../root"] ) -def test_root_path_is_normalized(path, root_dir_name): +def test_root_path_is_normalized(path_no_ds, root_dir_name): """ Test that root path in the first line of string output is normalized path """ - root = os.path.join(path, root_dir_name) + root = os.path.join(path_no_ds, root_dir_name) tree = Tree(root, max_depth=0) root_path = next(tree.print_line()) # first line of tree output - expected = os.path.join(path, "root") + expected = os.path.join(path_no_ds, "root") actual = root_path assert_str_equal(expected, actual) @@ -389,26 +391,29 @@ def test_print_tree_fails_for_nonexistent_directory(): @pytest.mark.parametrize( param_names, build_param_matrix(matrix_no_ds, param_names) ) -def test_print_stats( - path, depth, include_files, include_hidden, expected_stats_str +def test_print_stats_no_ds( + path_no_ds, depth, include_files, include_hidden, expected_stats_str ): - root = os.path.join(path, 'root') + root = os.path.join(path_no_ds, 'root') tree = Tree( root, max_depth=depth, - include_files=include_files, include_hidden=include_hidden).build() + include_files=include_files, include_hidden=include_hidden + ).build() actual_res = tree.stats() expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) -def test_tree_to_string(path): - root = os.path.join(path, 'root') +def test_tree_to_string(path_no_ds): + root = os.path.join(path_no_ds, 'root') tree = Tree(root, 3) actual = tree.to_string() expected = "\n".join(tree._lines) assert_str_equal(expected, actual) +# ================== Test directory tree with datasets ================== + param_names = ["depth", "datasets_only", "expected_str"] @@ -430,6 +435,24 @@ def test_print_tree_with_params_with_ds( assert_str_equal(expected_res, actual_res) +param_names = ["depth", "datasets_only", "expected_stats_str"] + + +@pytest.mark.parametrize( + param_names, build_param_matrix(matrix_ds, param_names) +) +def test_print_stats_with_ds( + path_ds, depth, datasets_only, expected_stats_str +): + root = os.path.join(path_ds, 'root') + tree = Tree( + root, max_depth=depth, datasets_only=datasets_only + ).build() + actual_res = tree.stats() + expected_res = expected_stats_str + assert_str_equal(expected_res, actual_res) + + def test_print_tree_full_paths(): # run in the cwd so detecting full paths is easier tree = Tree('.', max_depth=1, full_paths=True) @@ -440,10 +463,10 @@ def test_print_tree_full_paths(): assert_re_in(r"(?:└──|├──) \./", first_child) -def test_print_tree_depth_zero(path): - root = os.path.join(path, "root") +def test_print_tree_depth_zero(path_no_ds): + root = os.path.join(path_no_ds, "root") tree = Tree(root, max_depth=0, include_files=True) # should have no effect actual = tree.to_string() expected = root - assert_str_equal(expected, actual) \ No newline at end of file + assert_str_equal(expected, actual) From 4691966f0d70ef6fd5f1b09374a7cdfca19ea4c6 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 8 Jul 2022 16:23:05 +0200 Subject: [PATCH 023/131] remove failing tests for --datasets-only (impl needs rework) --- datalad_next/tests/test_tree.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 5ae91af3..0bac9400 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -297,29 +297,6 @@ def 
format_param_ids(val): | └── sd1_d0_subds0/ [DS~1] ├── sd1_ds0/ [DS~0] └── sd1_subds0/ [DS~1, not installed] -""", - }, - { - "depth": 1, - "datasets_only": True, - "expected_stats_str": "0 directories, 2 datasets, 0 files", - "expected_str": """ -├── superds0/ [DS~0] -└── superds1/ [DS~0] -""", - }, - { - "depth": 4, - "datasets_only": True, - "expected_stats_str": "0 directories, 7 datasets, 0 files", - "expected_str": """ -├── superds0/ [DS~0] -| └── sd0_subds0/ [DS~1] -| └── sd0_sub0_subds0/ [DS~2] -└── superds1/ [DS~0] - └── sd1_d0_subds0/ [DS~1] - ├── sd1_ds0/ [DS~0] - └── sd1_subds0/ [DS~1, not installed] """, }, ] From 5fd855d27e2a70b3ef20ad67a8f507f36ad88aa7 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 8 Jul 2022 16:24:45 +0200 Subject: [PATCH 024/131] fix false-positive detection of datasets on pure git repos --- datalad_next/tests/test_tree.py | 24 +++++++++++++++++------- datalad_next/tree.py | 13 +++++++++++-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 0bac9400..18d64073 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -4,6 +4,7 @@ import pytest from datalad.distribution.dataset import Dataset +from datalad.tests.test_utils_testrepos import BasicGitTestRepo from datalad.tests.utils_pytest import ( assert_raises, assert_str_equal, @@ -95,14 +96,19 @@ def path_ds(): "superds1": { "sd1_file0": "", "sd1_dir0": { - "sd1_d0_dir0": {}, + "sd1_d0_repo0": { + "INFO.txt": "", + "test.dat": "" + }, "sd1_d0_subds0": {}, }, "sd1_ds0": {}, # not registered as subdataset "sd1_subds0": {}, # not installed (drop all) }, - "dir0": { - "d0_file0": "" + # plain git repo (contents are defined in BasicGitTestRepo) + "repo0": { + "INFO.txt": "", + "test.dat": "" }, "file0": "", } @@ -110,14 +116,18 @@ def path_ds(): temp_dir_root = create_temp_dir_tree(ds_tree) - # create datasets + # create datasets / repos root = opj(temp_dir_root, "root") + BasicGitTestRepo(path=opj(root, "repo0"), puke_if_exists=False) superds0 = Dataset(opj(root, "superds0")).create(force=True) sd0_subds0 = superds0.create("sd0_subds0", force=True) sd0_subds0.create("sd0_sub0_subds0", force=True) superds1 = Dataset(opj(root, "superds1")).create(force=True) superds1.create(opj("sd1_dir0", "sd1_d0_subds0"), force=True) Dataset(opj(root, "superds1", "sd1_ds0")).create(force=True) + BasicGitTestRepo( + path=opj(root, "superds1", "sd1_dir0", "sd1_d0_repo0"), + puke_if_exists=False) sd1_subds0 = superds1.create("sd1_subds0", force=True) sd1_subds0.drop(what='all', reckless='kill', recursive=True) @@ -277,7 +287,7 @@ def format_param_ids(val): "datasets_only": False, "expected_stats_str": "1 directories, 2 datasets, 0 files", "expected_str": """ -├── dir0/ +├── repo0/ ├── superds0/ [DS~0] └── superds1/ [DS~0] """, @@ -287,13 +297,13 @@ def format_param_ids(val): "datasets_only": False, "expected_stats_str": "3 directories, 7 datasets, 0 files", "expected_str": """ -├── dir0/ +├── repo0/ ├── superds0/ [DS~0] | └── sd0_subds0/ [DS~1] | └── sd0_sub0_subds0/ [DS~2] └── superds1/ [DS~0] ├── sd1_dir0/ - | ├── sd1_d0_dir0/ + | ├── sd1_d0_repo0/ | └── sd1_d0_subds0/ [DS~1] ├── sd1_ds0/ [DS~0] └── sd1_subds0/ [DS~1, not installed] diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 391c32d3..551527b2 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -482,15 +482,24 @@ def is_dataset(path): We infer that a directory is a dataset if it is either: (A) installed, or (B) not 
installed, but it has an installed superdataset.
+       Only consider datalad datasets, not plain git/git-annex repos.
     """
     ds = require_dataset(path, check_installed=False)
-    if ds.is_installed():
+
+    if ds.id is not None:
         return True
 
     # check if it has an installed superdataset
     superds = ds.get_superdataset(datalad_only=True, topmost=False,
                                   registered_only=True)
-    return superds is not None
+    if superds is not None:
+        return True
+
+    # if it has no dataset ID, it's just a plain repo
+    # (or, it is a datalad dataset that is not installed
+    # and has no parent dataset -- we have no way to
+    # detect these)
+    return False
 
 
 class DatasetNode(DirectoryNode):

From 6401f556b48c27c41049558be8bd02b87bf7238b Mon Sep 17 00:00:00 2001
From: Caterina Trainito
Date: Mon, 11 Jul 2022 11:12:11 +0200
Subject: [PATCH 025/131] improve efficiency of dataset detection

---
 datalad_next/tree.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/datalad_next/tree.py b/datalad_next/tree.py
index 551527b2..2fba75f6 100644
--- a/datalad_next/tree.py
+++ b/datalad_next/tree.py
@@ -486,19 +486,31 @@ def is_dataset(path):
     """
     ds = require_dataset(path, check_installed=False)
 
-    if ds.id is not None:
+    # detect if it is an installed datalad-proper dataset
+    # (as opposed to git/git-annex repo).
+    # could also query `ds.id`, but checking just for existence
+    # of config file is quicker.
+    if os.path.isfile(os.path.join(ds.path, ".datalad", "config")):
         return True
 
-    # check if it has an installed superdataset
-    superds = ds.get_superdataset(datalad_only=True, topmost=False,
-                                  registered_only=True)
-    if superds is not None:
+    # if it is not installed, check if it has an installed superdataset.
+    # instead of querying ds.is_installed() (which checks if the
+    # directory has the .git folder), we check if the directory
+    # is empty (faster) -- as e.g. after a non-recursive `datalad clone`
+    def is_empty_dir():
+        with os.scandir(path) as contents:
+            if any(contents):
+                return False
+        return True
 
-    # if it has no dataset ID, it's just a plain repo
-    # (or, it is a datalad dataset that is not installed
-    # and has no parent dataset -- we have no way to
-    # detect these)
+    if is_empty_dir():
+        superds = ds.get_superdataset(datalad_only=True, topmost=False,
+                                      registered_only=True)
+        if superds is not None:
+            return True
+
+    # TODO: do we have a way to detect a datalad dataset if it
+    # is not installed and it is not a subdataset?
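
The distinction drawn in this patch rests on a single marker file. A minimal standalone sketch of just that check (hypothetical function name, without the empty-directory and superdataset fallbacks handled above):

    from pathlib import Path

    def looks_like_datalad_dataset(path):
        # a plain git/git-annex repo has a .git entry but no
        # .datalad/config; an installed datalad dataset has both
        return (Path(path) / ".datalad" / "config").is_file()
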
return False From 364a9254dfbb2f6efce989952a1c5570392d61ca Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 12 Jul 2022 00:11:09 +0200 Subject: [PATCH 026/131] remove argument --datasets-only, to be replaced with dataset subtree depth logic --- datalad_next/tests/test_tree.py | 22 ++++++++----------- datalad_next/tree.py | 39 +++++++++++---------------------- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 18d64073..c7ab1f87 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -284,7 +284,6 @@ def format_param_ids(val): matrix_ds = [ { "depth": 1, - "datasets_only": False, "expected_stats_str": "1 directories, 2 datasets, 0 files", "expected_str": """ ├── repo0/ @@ -294,7 +293,6 @@ def format_param_ids(val): }, { "depth": 4, - "datasets_only": False, "expected_stats_str": "3 directories, 7 datasets, 0 files", "expected_str": """ ├── repo0/ @@ -401,7 +399,7 @@ def test_tree_to_string(path_no_ds): # ================== Test directory tree with datasets ================== -param_names = ["depth", "datasets_only", "expected_str"] +param_names = ["depth", "expected_str"] @pytest.mark.parametrize( @@ -409,32 +407,30 @@ def test_tree_to_string(path_no_ds): ids=format_param_ids ) def test_print_tree_with_params_with_ds( - path_ds, depth, datasets_only, expected_str + path_ds, depth, expected_str ): root = os.path.join(path_ds, "root") - tree = Tree(root, max_depth=depth, datasets_only=datasets_only) - # skip the first line with the root directory - # as we will test it separately + tree = Tree( + root, max_depth=depth, + skip_root=True # skip the first line with the root directory + ) lines = tree.print_line() - next(lines) # skip the first line (root dir) actual_res = "\n".join(l for l in lines) + "\n" expected_res = expected_str.lstrip("\n") # strip first newline assert_str_equal(expected_res, actual_res) -param_names = ["depth", "datasets_only", "expected_stats_str"] +param_names = ["depth", "expected_stats_str"] @pytest.mark.parametrize( param_names, build_param_matrix(matrix_ds, param_names) ) def test_print_stats_with_ds( - path_ds, depth, datasets_only, expected_stats_str + path_ds, depth, expected_stats_str ): root = os.path.join(path_ds, 'root') - tree = Tree( - root, max_depth=depth, datasets_only=datasets_only - ).build() + tree = Tree(root, max_depth=depth).build() actual_res = tree.stats() expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 2fba75f6..67adae3b 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -96,10 +96,6 @@ class TreeCommand(Interface): args=("-R", "--dataset-depth",), doc="""maximum level of nested subdatasets to display""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), - datasets_only=Parameter( - args=("--datasets-only",), - doc="""only list directories that are datasets""", - action='store_true'), include_files=Parameter( args=("--include-files",), doc="""include files in output display""", @@ -123,8 +119,8 @@ class TreeCommand(Interface): dict(text="List all first- and second-level subdatasets " "of parent datasets located anywhere under /tmp, " "regardless of directory depth", - code_py="tree('/tmp', dataset_depth=2, datasets_only=True, full_paths=True)", - code_cmd="datalad tree /tmp -R 2 --datasets-only --full-paths"), + code_py="tree('/tmp', dataset_depth=2, depth=0, full_paths=True)", + code_cmd="datalad tree /tmp 
-R 2 -L 0 --full-paths"), dict(text="Display first- and second-level subdatasets and their" "contents up to 3 directories deep (within each subdataset)", code_py="tree('.', dataset_depth=2, directory_depth=1)", @@ -139,7 +135,6 @@ def __call__( *, depth=None, dataset_depth=None, - datasets_only=False, include_files=False, include_hidden=False, full_paths=False, @@ -203,9 +198,9 @@ class Tree(object): of the whole tree and the statistics (counts of different node types). """ - def __init__(self, root: str, max_depth=None, dataset_max_depth=None, - datasets_only=False, include_files=False, - include_hidden=False, full_paths=False): + def __init__(self, root: str, max_depth=None, + include_files=False, include_hidden=False, + full_paths=False, skip_root=False): if not os.path.isdir(root): raise ValueError(f"directory '{root}' not found") @@ -216,7 +211,6 @@ def __init__(self, root: str, max_depth=None, dataset_max_depth=None, raise ValueError("max_depth must be >= 0") self.dataset_max_depth = dataset_max_depth - self.datasets_only = datasets_only self.include_files = include_files self.include_hidden = include_hidden self.full_paths = full_paths @@ -312,14 +306,11 @@ def _generate_nodes(self): # handle directories/datasets dir_or_ds = DirectoryOrDatasetNode( - path, current_depth, self._is_last_child(path), self.full_paths - ) - if current_depth == 0 or \ - not self.datasets_only or \ - self.datasets_only and isinstance(dir_or_ds, DatasetNode): - yield dir_or_ds - - if self.max_depth == 0: + path, current_depth, self._is_last_child(path), + use_full_paths=self.full_paths) + yield dir_or_ds + + if self.max_depth == 0 and current_depth == 0: break # just yield the root dir and exit # handle files @@ -337,16 +328,12 @@ def _generate_nodes(self): for child_dir in dirs: dir_path = os.path.join(path, child_dir) - dir_or_ds = DirectoryOrDatasetNode( + yield DirectoryOrDatasetNode( dir_path, current_depth + 1, - self._is_last_child(dir_path), self.full_paths + self._is_last_child(dir_path), + use_full_paths=self.full_paths ) - if not self.datasets_only or \ - self.datasets_only and isinstance(dir_or_ds, - DatasetNode): - yield dir_or_ds - # empty in-place the list of next directories to # traverse, which effectively stops os.walk's walking dirs[:] = [] From 00cbab96ec0c5400a2620a529fa2c8608367e9a1 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 12 Jul 2022 15:25:47 +0200 Subject: [PATCH 027/131] remove redundant logic in directory walk --- datalad_next/tree.py | 45 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 67adae3b..5a928077 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -232,20 +232,10 @@ def _is_last_child(self, path): return path in self._last_children def _is_max_depth_reached(self, path): - """ - If max depth is reached, it means we will not traverse - any further directories in the next iteration. - However, we will still list any directories or files - right below the current level. - Therefore, we 'reach' when we get to 1 level *before* max_depth. 
- """ if self.max_depth is not None: - return self._current_depth(path) == self.max_depth - 1 + return self._current_depth(path) == self.max_depth return False # unlimited depth - def _is_max_dataset_depth_reached(self, path): - pass - def stats(self): """ Equivalent of tree command's 'report line' at the end of the @@ -272,19 +262,29 @@ def build(self): @increment_node_count def _generate_nodes(self): """ - Yields _TreeNode objects, each representing a directory, dataset - or file. Nodes are traversed in depth-first order. + Yields ``_TreeNode`` objects, each representing a directory or + dataset or file. Nodes are traversed in depth-first order. """ # os.walk() does depth-first traversal for path, dirs, files in os.walk(self.root): + if self._is_max_depth_reached(path): + # empty in-place the list of next directories to + # traverse, which effectively stops os.walk's walking + dirs[:] = [] + files[:] = [] + # modify os.walk()'s output in-place to prevent - # traversal into those directories + # traversal into selected directories if not self.include_hidden: dirs[:] = [d for d in dirs if not d.startswith(".")] files[:] = [f for f in files if not f.startswith(".")] + if self.exclude_datasets: + dirs[:] = [d for d in dirs + if not is_dataset(os.path.join(path, d))] + # sort directories and files alphabetically in-place. # note that directories and files are sorted separately. # files are all listed before the directories @@ -319,25 +319,10 @@ def _generate_nodes(self): file_path = os.path.join(path, file) yield FileNode( file_path, current_depth + 1, - self._is_last_child(file_path), self.full_paths - ) - - if self._is_max_depth_reached(path): - # generate any remaining directory/dataset nodes, - # which will not be traversed in the next iteration - for child_dir in dirs: - dir_path = os.path.join(path, child_dir) - - yield DirectoryOrDatasetNode( - dir_path, current_depth + 1, - self._is_last_child(dir_path), + self._is_last_child(file_path), use_full_paths=self.full_paths ) - # empty in-place the list of next directories to - # traverse, which effectively stops os.walk's walking - dirs[:] = [] - def to_string(self): """Return complete tree as string""" if not self._lines: From 9cb0baf9d26efb15ff3a45b807523c61abc77f00 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 12 Jul 2022 18:18:55 +0200 Subject: [PATCH 028/131] store last children as set for faster is_last_child check --- datalad_next/tree.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 5a928077..fc30ffb3 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -216,7 +216,7 @@ def __init__(self, root: str, max_depth=None, self.full_paths = full_paths self._lines = [] # list of lines of output string - self._last_children = [] # last child of each subtree + self._last_children = set([]) # last child of each subtree # TODO: stats should automatically register all concrete _TreeNode classes self._stats = {"DirectoryNode": 0, "DatasetNode": 0, "FileNode": 0} @@ -298,10 +298,9 @@ def _generate_nodes(self): # files are listed first, directories come last. # so we take the last subdirectory if it exists, # otherwise the last file. 
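
The in-place pruning these hunks rely on is the documented os.walk() contract: in the default top-down mode, slice-assigning to dirs decides which subdirectories get descended into at all. A self-contained sketch with a hypothetical hidden-directory filter:

    import os

    def visible_dirs(root):
        for path, dirs, files in os.walk(root):
            # slice assignment mutates the very list that os.walk()
            # consults for the next descent, so dropped entries
            # (and their whole subtrees) are never visited
            dirs[:] = sorted(d for d in dirs if not d.startswith("."))
            yield path

Plain assignment (dirs = [...]) would only rebind the local name and leave the traversal unchanged; the slice form is what makes the pruning take effect.
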
- self._last_children.append( + self._last_children.add( os.path.join(path, dirs[-1] if dirs else files[-1]) ) - current_depth = self._current_depth(path) # handle directories/datasets From 5e6511053e2d57e7e030eb3a720203faccfc0a46 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 12 Jul 2022 18:21:06 +0200 Subject: [PATCH 029/131] extract is_dataset() into function outside class for easier reuse, add cache --- datalad_next/tree.py | 86 ++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index fc30ffb3..38263992 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -44,7 +44,7 @@ import logging import os -from functools import wraps +from functools import wraps, lru_cache from datalad.interface.base import ( Interface, @@ -184,6 +184,47 @@ def _wrapper(*args, **kwargs): return _wrapper +# whether path is a dataset should not change within +# command execution time, so we cache it +@lru_cache +def is_dataset(path): + """ + Fast dataset detection. + Infer that a directory is a dataset if it is either: + (A) installed, or + (B) not installed, but has an installed superdatset. + Only consider datalad datasets, not plain git/git-annex repos. + """ + ds = require_dataset(path, check_installed=False) + + # detect if it is an installed datalad-proper dataset + # (as opposed to git/git-annex repo). + # could also query `ds.id`, but checking just for existence + # of config file is quicker. + if os.path.isfile(os.path.join(ds.path, ".datalad", "config")): + return True + + # if it is not installed, check if it has an installed superdataset. + # instead of querying ds.is_installed() (which checks if the + # directory has the .git folder), we check if the directory + # is empty (faster) -- as e.g. after a non-recursive `datalad clone` + def is_empty_dir(): + with os.scandir(path) as contents: + if any(contents): + return False + return True + + if is_empty_dir(): + superds = ds.get_superdataset(datalad_only=True, topmost=False, + registered_only=True) + if superds is not None: + return True + + # TODO: do we have a way to detect a datalad dataset if it + # is not installed and it is not a subdataset? + return False + + def is_path_child_of_parent(child, parent): parent_abs = os.path.abspath(parent) child_abs = os.path.abspath(child) @@ -440,50 +481,11 @@ class DirectoryOrDatasetNode(_TreeNode): based on whether the current path is a dataset or not. """ def __new__(cls, path, *args, **kwargs): - if cls.is_dataset(path): - ds_node = DatasetNode(path, *args, **kwargs) - ds_node.calculate_dataset_depth() - return ds_node + if is_dataset(path): + return DatasetNode(path, *args, **kwargs) else: return DirectoryNode(path, *args, **kwargs) - @staticmethod - def is_dataset(path): - """ - We infer that a directory is a dataset if it is either: - (A) installed, or - (B) not installed, but it has an installed superdatset. - Only consider datalad datasets, not plain git/git-annex repos. - """ - ds = require_dataset(path, check_installed=False) - - # detect if it is an installed datalad-proper dataset - # (as opposed to git/git-annex repo). - # could also query `ds.id`, but checking just for existence - # of config file is quicker. - if os.path.isfile(os.path.join(ds.path, ".datalad", "config")): - return True - - # if it is not installed, check if it has an installed superdataset. 
- # instead of querying ds.is_installed() (which checks if the - # directory has the .git folder), we check if the directory - # is empty (faster) -- as e.g. after a non-recursive `datalad clone` - def is_empty_dir(): - with os.scandir(path) as contents: - if any(contents): - return False - return True - - if is_empty_dir(): - superds = ds.get_superdataset(datalad_only=True, topmost=False, - registered_only=True) - if superds is not None: - return True - - # TODO: do we have a way to detect a datalad dataset if it - # is not installed and it is not a subdataset? - return False - class DatasetNode(DirectoryNode): COLOR = ansi_colors.MAGENTA From c50f7b93ec06727baa53c514d0dce922529f5c81 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 15 Jul 2022 14:34:31 +0200 Subject: [PATCH 030/131] WIP: refactor using pathlib for simpler node generation --- datalad_next/tests/test_tree.py | 207 ++++++++++------- datalad_next/tree.py | 381 +++++++++++++++++++++----------- 2 files changed, 387 insertions(+), 201 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index c7ab1f87..cbafdde2 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,6 +1,7 @@ import os from os.path import join as opj from datetime import datetime +from pathlib import Path import pytest from datalad.distribution.dataset import Dataset @@ -12,7 +13,7 @@ ) from datalad.utils import rmtemp -from ..tree import Tree +from ..tree import Tree, DatasetTree """ Tests for datalad tree. @@ -74,7 +75,7 @@ def path_no_ds(): } temp_dir_root = create_temp_dir_tree(dir_tree) - yield temp_dir_root + yield Path(temp_dir_root).resolve() rmtemp(temp_dir_root) assert not os.path.exists(temp_dir_root) @@ -96,20 +97,14 @@ def path_ds(): "superds1": { "sd1_file0": "", "sd1_dir0": { - "sd1_d0_repo0": { - "INFO.txt": "", - "test.dat": "" - }, + "sd1_d0_repo0": {}, "sd1_d0_subds0": {}, }, "sd1_ds0": {}, # not registered as subdataset "sd1_subds0": {}, # not installed (drop all) }, # plain git repo (contents are defined in BasicGitTestRepo) - "repo0": { - "INFO.txt": "", - "test.dat": "" - }, + "repo0": {}, "file0": "", } } @@ -131,7 +126,7 @@ def path_ds(): sd1_subds0 = superds1.create("sd1_subds0", force=True) sd1_subds0.drop(what='all', reckless='kill', recursive=True) - yield temp_dir_root + yield Path(temp_dir_root).resolve() # delete temp dir rmtemp(temp_dir_root) @@ -154,7 +149,7 @@ def format_param_ids(val): "depth": 1, "include_files": False, "include_hidden": False, - "expected_stats_str": "3 directories, 0 datasets, 0 files", + "expected_stats_str": "0 datasets, 3 directories, 0 files", "expected_str": """ ├── dir0/ ├── dir1/ @@ -165,7 +160,7 @@ def format_param_ids(val): "depth": 3, "include_files": False, "include_hidden": False, - "expected_stats_str": "6 directories, 0 datasets, 0 files", + "expected_stats_str": "0 datasets, 6 directories, 0 files", "expected_str": """ ├── dir0/ ├── dir1/ @@ -179,83 +174,83 @@ def format_param_ids(val): "depth": 1, "include_files": True, "include_hidden": False, - "expected_stats_str": "3 directories, 0 datasets, 2 files", + "expected_stats_str": "0 datasets, 3 directories, 2 files", "expected_str": """ -├── file0 -├── file1 ├── dir0/ ├── dir1/ -└── dir2/ +├── dir2/ +├── file0 +└── file1 """ }, { "depth": 3, "include_files": True, "include_hidden": False, - "expected_stats_str": "6 directories, 0 datasets, 8 files", + "expected_stats_str": "0 datasets, 6 directories, 8 files", "expected_str": """ -├── file0 -├── file1 ├── dir0/ 
├── dir1/ | └── dir1_file0 -└── dir2/ - ├── dir2_file0 - ├── dir2_file1 - ├── dir2_dir0/ - ├── dir2_dir1/ - | └── dir2_dir1_file0 - └── dir2_dir2/ - ├── dir2_dir2_file0 - └── dir2_dir2_file1 +├── dir2/ +| ├── dir2_dir0/ +| ├── dir2_dir1/ +| | └── dir2_dir1_file0 +| ├── dir2_dir2/ +| | ├── dir2_dir2_file0 +| | └── dir2_dir2_file1 +| ├── dir2_file0 +| └── dir2_file1 +├── file0 +└── file1 """ }, { "depth": 1, "include_files": True, "include_hidden": True, - "expected_stats_str": "4 directories, 0 datasets, 3 files", + "expected_stats_str": "0 datasets, 4 directories, 3 files", "expected_str": """ -├── .file2 -├── file0 -├── file1 ├── .dir3/ +├── .file2 ├── dir0/ ├── dir1/ -└── dir2/ +├── dir2/ +├── file0 +└── file1 """ }, { "depth": 3, "include_files": True, "include_hidden": True, - "expected_stats_str": "7 directories, 0 datasets, 11 files", + "expected_stats_str": "0 datasets, 7 directories, 11 files", "expected_str": """ -├── .file2 -├── file0 -├── file1 ├── .dir3/ | ├── .dir3_file1 | └── dir3_file0 +├── .file2 ├── dir0/ ├── dir1/ | └── dir1_file0 -└── dir2/ - ├── dir2_file0 - ├── dir2_file1 - ├── dir2_dir0/ - ├── dir2_dir1/ - | └── dir2_dir1_file0 - └── dir2_dir2/ - ├── dir2_dir2_file0 - └── dir2_dir2_file1 +├── dir2/ +| ├── dir2_dir0/ +| ├── dir2_dir1/ +| | └── dir2_dir1_file0 +| ├── dir2_dir2/ +| | ├── dir2_dir2_file0 +| | └── dir2_dir2_file1 +| ├── dir2_file0 +| └── dir2_file1 +├── file0 +└── file1 """ }, { "depth": 1, "include_files": False, "include_hidden": True, - "expected_stats_str": "4 directories, 0 datasets, 0 files", + "expected_stats_str": "0 datasets, 4 directories, 0 files", "expected_str": """ ├── .dir3/ ├── dir0/ @@ -267,7 +262,7 @@ def format_param_ids(val): "depth": 3, "include_files": False, "include_hidden": True, - "expected_stats_str": "7 directories, 0 datasets, 0 files", + "expected_stats_str": "0 datasets, 7 directories, 0 files", "expected_str": """ ├── .dir3/ ├── dir0/ @@ -284,7 +279,7 @@ def format_param_ids(val): matrix_ds = [ { "depth": 1, - "expected_stats_str": "1 directories, 2 datasets, 0 files", + "expected_stats_str": "2 datasets, 1 directories, 0 files", "expected_str": """ ├── repo0/ ├── superds0/ [DS~0] @@ -293,7 +288,7 @@ def format_param_ids(val): }, { "depth": 4, - "expected_stats_str": "3 directories, 7 datasets, 0 files", + "expected_stats_str": "7 datasets, 3 directories, 0 files", "expected_str": """ ├── repo0/ ├── superds0/ [DS~0] @@ -309,6 +304,55 @@ def format_param_ids(val): }, ] +matrix_max_ds_depth = [ + { + "dataset_depth": 0, + "depth": 0, + "expected_str": """ +├── superds0/ [DS~0] +└── superds1/ [DS~0] + └── sd1_ds0/ [DS~0] +""" + }, + { + "dataset_depth": 0, + "depth": 1, + "expected_str": """ +├── superds0/ [DS~0] +└── superds1/ [DS~0] + ├── sd1_dir0/ + └── sd1_ds0/ [DS~0] +""" + }, + { + "dataset_depth": 1, + "depth": 0, + "expected_str": """ +├── superds0/ [DS~0] +| └── sd0_subds0/ [DS~1] +└── superds1/ [DS~0] + ├── sd1_dir0/ + | └── sd1_d0_subds0/ [DS~1] + ├── sd1_ds0/ [DS~0] + └── sd1_subds0/ [DS~1, not installed] +""" + }, + { + "dataset_depth": 1, + "depth": 2, + "expected_str": """ +├── superds0/ [DS~0] +| └── sd0_subds0/ [DS~1] +└── superds1/ [DS~0] + ├── sd1_dir0/ + | ├── sd1_d0_repo0/ + | └── sd1_d0_subds0/ [DS~1] + ├── sd1_ds0/ [DS~0] + └── sd1_subds0/ [DS~1, not installed] +""" + }, +] + def build_param_matrix(matrix, params): """Turn inner dicts into lists (required by pytest parametrize)""" @@ -336,13 +380,16 @@ def test_print_tree_with_params_no_ds( root = os.path.join(path_no_ds, "root") tree = Tree( root, 
max_depth=depth, - include_files=include_files, include_hidden=include_hidden) - # skip the first line with the root directory - # as we will test it separately + include_files=include_files, include_hidden=include_hidden, + skip_root=True # skip the first line with the root directory + ) lines = tree.print_line() - next(lines) # skip the first line (root dir) actual_res = "\n".join(l for l in lines) + "\n" expected_res = expected_str.lstrip("\n") # strip first newline + print("expecte:") + print(expected_res) + print("actual:") + print(actual_res) assert_str_equal(expected_res, actual_res) @@ -354,11 +401,10 @@ def test_root_path_is_normalized(path_no_ds, root_dir_name): Test that root path in the first line of string output is normalized path """ - root = os.path.join(path_no_ds, root_dir_name) + root = path_no_ds / root_dir_name tree = Tree(root, max_depth=0) - root_path = next(tree.print_line()) # first line of tree output - expected = os.path.join(path_no_ds, "root") - actual = root_path + expected = str(path_no_ds / "root") + actual = next(tree.print_line()) # first line of tree output assert_str_equal(expected, actual) @@ -379,7 +425,7 @@ def test_print_tree_fails_for_nonexistent_directory(): def test_print_stats_no_ds( path_no_ds, depth, include_files, include_hidden, expected_stats_str ): - root = os.path.join(path_no_ds, 'root') + root = path_no_ds / 'root' tree = Tree( root, max_depth=depth, include_files=include_files, include_hidden=include_hidden @@ -390,7 +436,7 @@ def test_print_stats_no_ds( def test_tree_to_string(path_no_ds): - root = os.path.join(path_no_ds, 'root') + root = path_no_ds / 'root' tree = Tree(root, 3) actual = tree.to_string() expected = "\n".join(tree._lines) @@ -409,7 +455,7 @@ def test_tree_to_string(path_no_ds): def test_print_tree_with_params_with_ds( path_ds, depth, expected_str ): - root = os.path.join(path_ds, "root") + root = path_ds / "root" tree = Tree( root, max_depth=depth, skip_root=True # skip the first line with the root directory @@ -429,27 +475,38 @@ def test_print_tree_with_params_with_ds( def test_print_stats_with_ds( path_ds, depth, expected_stats_str ): - root = os.path.join(path_ds, 'root') + root = path_ds / 'root' tree = Tree(root, max_depth=depth).build() actual_res = tree.stats() expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) -def test_print_tree_full_paths(): - # run in the cwd so detecting full paths is easier - tree = Tree('.', max_depth=1, full_paths=True) - # get the second line (first child, hopefully exists) - lines = tree.print_line() - next(lines) # skip the first line (root dir) - first_child = next(lines) - assert_re_in(r"(?:└──|├──) \./", first_child) - - def test_print_tree_depth_zero(path_no_ds): - root = os.path.join(path_no_ds, "root") + root = path_no_ds / "root" tree = Tree(root, max_depth=0, include_files=True) # should have no effect actual = tree.to_string() - expected = root + expected = str(root) assert_str_equal(expected, actual) + + +param_names = ["dataset_depth", "depth", "expected_str"] + + +@pytest.mark.parametrize( + param_names, build_param_matrix(matrix_max_ds_depth, param_names), + ids=format_param_ids +) +def test_print_tree_with_max_dataset_depth( + path_ds, dataset_depth, depth, expected_str +): + root = path_ds / "root" + tree = DatasetTree( + root, max_depth=depth, max_dataset_depth=dataset_depth, + skip_root=True) + lines = tree.print_line() + actual_res = "\n".join(l for l in lines) + "\n" + expected_res = expected_str.lstrip("\n") # strip first newline + 
assert_str_equal(expected_res, actual_res) + diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 38263992..e70d642b 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -45,6 +45,7 @@ import logging import os from functools import wraps, lru_cache +from pathlib import Path from datalad.interface.base import ( Interface, @@ -139,15 +140,23 @@ def __call__( include_hidden=False, full_paths=False, ): - tree = Tree( - path, - max_depth=depth, - dataset_max_depth=dataset_depth, - datasets_only=datasets_only, - include_files=include_files, - include_hidden=include_hidden, - full_paths=full_paths - ) + if dataset_depth is not None: + tree = DatasetTree( + path, + max_depth=depth, + max_dataset_depth=dataset_depth, + include_files=include_files, + include_hidden=include_hidden, + full_paths=full_paths + ) + else: + tree = Tree( + path, + max_depth=depth, + include_files=include_files, + include_hidden=include_hidden, + full_paths=full_paths + ) for line in tree.print_line(): # print one line at a time to improve UX / perceived speed @@ -229,7 +238,50 @@ def is_path_child_of_parent(child, parent): parent_abs = os.path.abspath(parent) child_abs = os.path.abspath(child) return os.path.commonpath([parent_abs]) == \ - os.path.commonpath([parent_abs, child_abs]) + os.path.commonpath([parent_abs, child_abs]) + +class _TreeNode(object): + """ + Base class for a directory or file represented as a single + tree node and printed as single line of the 'tree' output. + """ + COLOR = None # ANSI color for the path, if terminal color are enabled + + def __init__(self, path: str, depth: int, is_last_child: bool, + use_full_paths=False): + self.path = path # path relative to tree root + self.depth = depth # depth in the directory tree + self.is_last_child = is_last_child # if it is last item of its subtree + self.use_full_paths = use_full_paths + + def __eq__(self, other): + return self.path == other.path + + def __hash__(self): + return hash(self.path) + + def __str__(self): + if self.depth == 0 or self.use_full_paths: + path = self.path + else: + path = os.path.basename(self.path) + + if self.COLOR is not None: + path = ansi_colors.color_word(path, self.COLOR) + + prefix = "" + if self.depth > 0: + prefix = "└── " if self.is_last_child else "├── " + + return prefix + str(path) + + def _get_tree_root(self): + """Calculate tree root path from node path and depth""" + root = self.path + for _ in range(self.depth): + root = os.path.dirname(root) + return root + class Tree(object): @@ -239,42 +291,37 @@ class Tree(object): of the whole tree and the statistics (counts of different node types). 
""" - def __init__(self, root: str, max_depth=None, + def __init__(self, root: Path, max_depth=None, include_files=False, include_hidden=False, - full_paths=False, skip_root=False): + full_paths=False, exclude_datasets=False, + skip_root=False): - if not os.path.isdir(root): + # TODO: root should already be given as Path object + self.root = Path(root).resolve() + if not self.root.is_dir(): raise ValueError(f"directory '{root}' not found") - self.root = os.path.normpath(root) self.max_depth = max_depth if max_depth is not None and max_depth < 0: raise ValueError("max_depth must be >= 0") - self.dataset_max_depth = dataset_max_depth self.include_files = include_files self.include_hidden = include_hidden self.full_paths = full_paths + self.exclude_datasets = exclude_datasets + self.skip_root = skip_root # do not print first line with root path self._lines = [] # list of lines of output string - self._last_children = set([]) # last child of each subtree # TODO: stats should automatically register all concrete _TreeNode classes self._stats = {"DirectoryNode": 0, "DatasetNode": 0, "FileNode": 0} - def _current_depth(self, path: str): + def _get_depth(self, path: Path): """Directory depth of current path relative to root of the tree""" - # directory depth can be safely inferred from the number of - # path separators in path, since pathsep characters are illegal - # in file or directory names. - return path.count(os.path.sep) - self.root.count(os.path.sep) - - def _is_last_child(self, path): - """Whether an item is the last child within its subtree""" - return path in self._last_children + return len(path.relative_to(self.root).parts) def _is_max_depth_reached(self, path): if self.max_depth is not None: - return self._current_depth(path) == self.max_depth + return self._get_depth(path) == self.max_depth return False # unlimited depth def stats(self): @@ -282,12 +329,11 @@ def stats(self): Equivalent of tree command's 'report line' at the end of the tree output. The 3 node types (directory, dataset, file) are mutually exclusive, - so their total is the total count of nodes. - Only counts contents below the root directory, does not count - the root itself. + so their sum equals to the total node count. + Does not count the root itself, only the contents below the root. """ - return f"{self._stats['DirectoryNode']} directories, " \ - f"{self._stats['DatasetNode']} datasets, " \ + return f"{self._stats['DatasetNode']} datasets, " \ + f"{self._stats['DirectoryNode']} directories, " \ f"{self._stats['FileNode']} files" def _total_nodes(self): @@ -300,68 +346,60 @@ def build(self): self.to_string() return self - @increment_node_count - def _generate_nodes(self): + def _new_node(self, node: _TreeNode): + """A helper to instantiate TreeNode and increment node count""" + node_type = node.__class__.__name__ + if node_type not in self._stats: + raise ValueError( + f"No stats collected for unknown node type '{node_type}'" + ) + if node.depth > 0: # we do not count the root directory + self._stats[node_type] += 1 + return node + + def _generate_dataset_tree_nodes(self, dir_path: Path, is_last_child=True): + pass + + def _generate_nodes(self, dir_path: Path, is_last_child=True): """ Yields ``_TreeNode`` objects, each representing a directory or dataset or file. Nodes are traversed in depth-first order. 
""" - - # os.walk() does depth-first traversal - for path, dirs, files in os.walk(self.root): - - if self._is_max_depth_reached(path): - # empty in-place the list of next directories to - # traverse, which effectively stops os.walk's walking - dirs[:] = [] - files[:] = [] - - # modify os.walk()'s output in-place to prevent - # traversal into selected directories + def is_excluded(path): + exclusion_criteria = [] + if not self.include_files: + exclusion_criteria.append(path.is_file()) if not self.include_hidden: - dirs[:] = [d for d in dirs if not d.startswith(".")] - files[:] = [f for f in files if not f.startswith(".")] - - if self.exclude_datasets: - dirs[:] = [d for d in dirs - if not is_dataset(os.path.join(path, d))] - - # sort directories and files alphabetically in-place. - # note that directories and files are sorted separately. - # files are all listed before the directories - # (just by convention, no particular reason). - dirs.sort() - files.sort() - - # check if node is the last child within its subtree - # (needed for displaying special end-of-subtree prefix) - if dirs or files: # if there is a next level - # files are listed first, directories come last. - # so we take the last subdirectory if it exists, - # otherwise the last file. - self._last_children.add( - os.path.join(path, dirs[-1] if dirs else files[-1]) - ) - current_depth = self._current_depth(path) - - # handle directories/datasets - dir_or_ds = DirectoryOrDatasetNode( - path, current_depth, self._is_last_child(path), - use_full_paths=self.full_paths) - yield dir_or_ds - - if self.max_depth == 0 and current_depth == 0: - break # just yield the root dir and exit - - # handle files - if self.include_files: - for file in files: - file_path = os.path.join(path, file) - yield FileNode( - file_path, current_depth + 1, - self._is_last_child(file_path), - use_full_paths=self.full_paths + exclusion_criteria.append(path.name.startswith(".")) + return any(exclusion_criteria) + + yield self._new_node(DirectoryOrDatasetNode( + dir_path, self._get_depth(dir_path), is_last_child + )) + + if self._get_depth(dir_path) < self.max_depth: + # apply exclusion filter + selected_children = ( + p for p in Path(dir_path).iterdir() + if not is_excluded(p) + ) + # sort directory contents alphabetically + children = sorted(list(selected_children)) + + for child in children: + + # check if node is the last child within its subtree + # (needed for displaying special end-of-subtree prefix) + is_last_child = (child == children[-1]) + + if child.is_file(): + yield self._new_node( + FileNode(child, self._get_depth(child), is_last_child) ) + elif child.is_dir(): + # recurse into subdirectories + for node in self._generate_nodes(child, is_last_child): + yield node def to_string(self): """Return complete tree as string""" @@ -399,8 +437,10 @@ def _yield_lines(self): # is the last node of its own subtree. levels_with_exhausted_subtree = set([]) - for node in self._generate_nodes(): - lgr.debug(node) + for node in self._generate_nodes(self.root): + + if self.skip_root and node.depth == 0: + continue if node.is_last_child: # last child of its subtree levels_with_exhausted_subtree.add(node.depth) @@ -424,55 +464,140 @@ def _yield_lines(self): yield line -class _TreeNode(object): - """ - Base class for a directory or file represented as a single - tree node and printed as single line of the 'tree' output. 
- """ - COLOR = None # ANSI color for the path, if terminal color are enabled +class DatasetTree(Tree): + def __init__(self, *args, max_dataset_depth=0, **kwargs): + super().__init__(*args, **kwargs) + # by default, do not recurse into datasets' subdirectories + # (that are not datasets themselves) + if self.max_depth is None: + self.max_depth = 0 + + self.max_dataset_depth = max_dataset_depth + # meaning of 'dataset depth': + # -1 means no datasets encountered, + # 0 means top-level superdatasets (relative to the tree root), + # 1 means first-level subdatasets (relative to the tree root). + self._current_dataset_depth = -1 # -1 means no datasets encountered + + def _is_max_dataset_depth_reached(self): + if self.max_dataset_depth is not None: + return self._current_dataset_depth > self.max_dataset_depth + return False # unlimited depth - def __init__(self, path: str, depth: int, is_last_child: bool, - use_full_paths=False): - self.path = path - self.depth = depth # depth in the directory tree - self.is_last_child = is_last_child # if it is last item of its subtree - self.use_full_paths = use_full_paths + def _generate_nodes(self): + # keep track of datasets' parent directories that have already + # been yielded, so we do not yield them twice + visited_parents = set([]) + for path, dirs, files in os.walk(self.root): - def __str__(self): - if self.depth == 0 or self.use_full_paths: - path = self.path - else: - path = os.path.basename(self.path) + # optimization: do not traverse the .git folder + # (we're not going to find datasets in there) + dirs[:] = [d for d in dirs if d != ".git"] - if self.COLOR is not None: - path = ansi_colors.color_word(path, self.COLOR) + # sort directories alphabetically in-place + dirs.sort() - prefix = "" - if self.depth > 0: - prefix = "└── " if self.is_last_child else "├── " + # update the last child nodes + if dirs: # if there is a next level + self._last_children.add( + os.path.join(path, dirs[-1]) + ) - return prefix + path + if is_dataset(path): + current_depth = self._get_depth(path) + + ds = DatasetNode( + path, current_depth, self._is_last_child(path), + use_full_paths=self.full_paths) + self._current_dataset_depth = ds.ds_depth + + if self._is_max_dataset_depth_reached(): + # we do not prune the walk (empty the list of + # directories to traverse next), but just skip to + # to the next node. this is because datasets + # of a given depth *could* be located at any + # directory depth, even below datasets at the same + # or deeper level. + continue + + # yield intermediate directories + for parent, parent_depth in ds.parent_nodes(): + if parent not in visited_parents: + visited_parents.add(parent) + yield DirectoryOrDatasetNode( + parent, + parent_depth, + self._is_last_child(parent), + use_full_paths=self.full_paths + ) + visited_parents.add(ds.path) + yield ds + + if self.max_depth > 0: + # yield directories/files underneath dataset. + # here, the `max_depth` parameter refers to the + # directory depth of each dataset's subtree. 
+ subtree = Tree( + ds.path, max_depth=self.max_depth, + full_paths=self.full_paths, + include_files=self.include_files, + include_hidden=self.include_hidden, + exclude_datasets=True + ) + for subtree_node in subtree._generate_nodes(): + # skip root node and dataset nodes + # (will be yielded in main loop) + if subtree_node.depth > 0 and \ + not isinstance(subtree_node, DatasetNode): + # increment depth by offset of root + subtree_node.depth += ds.depth + if isinstance(subtree_node, DirectoryNode): + visited_parents.add(subtree_node.path) - def _get_tree_root(self): - """Calculate tree root path from node path and depth""" - root = self.path - for _ in range(self.depth): - root = os.path.dirname(root) - return root + yield subtree_node class DirectoryNode(_TreeNode): COLOR = ansi_colors.BLUE + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def __str__(self): string = super().__str__() if self.depth > 0: return string + "/" return string + def parent_nodes(self): + """ + Generate parent nodes of the current node, excluding the tree root + """ + if self.depth > 0: + parent = self.path + parent_paths = [] + for _ in range(self.depth): + parent = os.path.dirname(parent) + parent_paths.append(parent) + parent_paths = parent_paths[::-1] # sort from root to current node + # print(f"path: {self.path}") + # print(f"parents: {parent_paths}") + for depth, path in enumerate(parent_paths): + # print(depth_offset, p) + yield path, depth + # + # for ix, parent_path in enumerate(parent_paths, start=1): + # yield DirectoryOrDatasetNode( + # path=os.path.join(root, parent_path), + # depth=self.depth - ix, + # is_last_child=True, # temporary + # use_full_paths=self.use_full_paths + # ) + class FileNode(_TreeNode): - pass + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) class DirectoryOrDatasetNode(_TreeNode): @@ -495,12 +620,13 @@ def __init__(self, *args, **kwargs): self.ds = require_dataset(self.path, check_installed=False) self.is_installed = self.ds.is_installed() - self._ds_depth = None - self._absolute_ds_depth = None + self.ds_depth = None + self.ds_absolute_depth = None + self.calculate_dataset_depth() def __str__(self): install_flag = ", not installed" if not self.is_installed else "" - suffix = f" [DS~{self._absolute_ds_depth}{install_flag}]" + suffix = f" [DS~{self.ds_absolute_depth}{install_flag}]" return super().__str__() + suffix def calculate_dataset_depth(self): @@ -509,8 +635,9 @@ def calculate_dataset_depth(self): 1. subdataset depth relative to the tree root 2. 
absolute subdataset depth in the full hierarchy """ - self._ds_depth = 0 - self._absolute_ds_depth = 0 + # TODO: run this already in the constructor + self.ds_depth = 0 + self.ds_absolute_depth = 0 ds = self.ds @@ -526,10 +653,12 @@ def calculate_dataset_depth(self): # it is a top-level dataset, we are done break - self._absolute_ds_depth += 1 + self.ds_absolute_depth += 1 + # TODO: Path.is_relative_to() + # TODO: rglob() relative glob if is_path_child_of_parent(superds.path, self._get_tree_root()): # if the parent dataset is underneath the tree # root, we increment the relative depth - self._ds_depth += 1 + self.ds_depth += 1 ds = superds From 9a1b6fcd939709e790233930fa2fbc4a1760852d Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 15 Jul 2022 15:24:31 +0200 Subject: [PATCH 031/131] separate low-level vs public node generator to allow using decorator to increment node count --- datalad_next/tree.py | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index e70d642b..6c6e4c6c 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -283,7 +283,6 @@ def _get_tree_root(self): return root - class Tree(object): """ Main class for building and serializing a directory tree. @@ -346,25 +345,8 @@ def build(self): self.to_string() return self - def _new_node(self, node: _TreeNode): - """A helper to instantiate TreeNode and increment node count""" - node_type = node.__class__.__name__ - if node_type not in self._stats: - raise ValueError( - f"No stats collected for unknown node type '{node_type}'" - ) - if node.depth > 0: # we do not count the root directory - self._stats[node_type] += 1 - return node - - def _generate_dataset_tree_nodes(self, dir_path: Path, is_last_child=True): - pass - - def _generate_nodes(self, dir_path: Path, is_last_child=True): - """ - Yields ``_TreeNode`` objects, each representing a directory or - dataset or file. Nodes are traversed in depth-first order. - """ + def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): + """Recursively yield directory tree starting from ```dir_path``""" def is_excluded(path): exclusion_criteria = [] if not self.include_files: @@ -373,9 +355,9 @@ def is_excluded(path): exclusion_criteria.append(path.name.startswith(".")) return any(exclusion_criteria) - yield self._new_node(DirectoryOrDatasetNode( + yield DirectoryOrDatasetNode( dir_path, self._get_depth(dir_path), is_last_child - )) + ) if self._get_depth(dir_path) < self.max_depth: # apply exclusion filter @@ -393,14 +375,22 @@ def is_excluded(path): is_last_child = (child == children[-1]) if child.is_file(): - yield self._new_node( - FileNode(child, self._get_depth(child), is_last_child) - ) + yield FileNode(child, self._get_depth(child), is_last_child) elif child.is_dir(): # recurse into subdirectories - for node in self._generate_nodes(child, is_last_child): + for node in self._generate_tree_nodes(child, is_last_child): yield node + @increment_node_count + def generate_nodes(self): + """ + Traverse a directory tree starting from the root path. + Yields ``_TreeNode`` objects, each representing a directory or + dataset or file. Nodes are traversed in depth-first order. + """ + for node in self._generate_tree_nodes(self.root): + yield node + def to_string(self): """Return complete tree as string""" if not self._lines: @@ -437,7 +427,7 @@ def _yield_lines(self): # is the last node of its own subtree. 
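
The two depth notions documented in calculate_dataset_depth() above (depth relative to the tree root versus absolute depth in the full dataset hierarchy) fall out of a single walk up the superdataset chain. Roughly, stripped of the node bookkeeping, and leaning on Path.is_relative_to() as the TODO suggests (Python 3.9+; a sketch, not the patched method):

    from pathlib import Path

    def dataset_depths(ds, tree_root):
        absolute = relative = 0
        while True:
            superds = ds.get_superdataset(
                datalad_only=True, topmost=False, registered_only=True)
            if superds is None:
                break  # reached a top-level dataset
            absolute += 1
            # only parents underneath the tree root count towards
            # the relative depth
            if Path(superds.path).is_relative_to(Path(tree_root)):
                relative += 1
            ds = superds
        return absolute, relative
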
levels_with_exhausted_subtree = set([]) - for node in self._generate_nodes(self.root): + for node in self.generate_nodes(): if self.skip_root and node.depth == 0: continue From 537a75309a5f92c4e0fa9043b059a51a1c9cad55 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 24 Jul 2022 17:33:14 +0200 Subject: [PATCH 032/131] add tests for report line with max_datasets --- datalad_next/tests/test_tree.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index cbafdde2..23aaf3eb 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -308,6 +308,7 @@ def format_param_ids(val): { "dataset_depth": 0, "depth": 0, + "expected_stats_str": "3 datasets, 0 directories, 0 files", "expected_str": """ ├── superds0/ [DS~0] └── superds1/ [DS~0] @@ -317,6 +318,7 @@ def format_param_ids(val): { "dataset_depth": 0, "depth": 1, + "expected_stats_str": "3 datasets, 1 directories, 0 files", "expected_str": """ ├── superds0/ [DS~0] └── superds1/ [DS~0] @@ -327,6 +329,7 @@ def format_param_ids(val): { "dataset_depth": 1, "depth": 0, + "expected_stats_str": "6 datasets, 1 directories, 0 files", "expected_str": """ ├── superds0/ [DS~0] | └── sd0_subds0/ [DS~1] @@ -340,6 +343,7 @@ def format_param_ids(val): { "dataset_depth": 1, "depth": 2, + "expected_stats_str": "6 datasets, 2 directories, 0 files", "expected_str": """ ├── superds0/ [DS~0] | └── sd0_subds0/ [DS~1] @@ -386,7 +390,7 @@ def test_print_tree_with_params_no_ds( lines = tree.print_line() actual_res = "\n".join(l for l in lines) + "\n" expected_res = expected_str.lstrip("\n") # strip first newline - print("expecte:") + print("expected:") print(expected_res) print("actual:") print(actual_res) @@ -508,5 +512,26 @@ def test_print_tree_with_max_dataset_depth( lines = tree.print_line() actual_res = "\n".join(l for l in lines) + "\n" expected_res = expected_str.lstrip("\n") # strip first newline + print("expected:") + print(expected_res) + print("actual:") + print(actual_res) assert_str_equal(expected_res, actual_res) + +param_names = ["dataset_depth", "depth", "expected_stats_str"] + + +@pytest.mark.parametrize( + param_names, build_param_matrix(matrix_max_ds_depth, param_names) +) +def test_print_stats_with_max_dataset_depth( + path_ds, dataset_depth, depth, expected_stats_str +): + root = path_ds / 'root' + tree = DatasetTree(root, max_depth=depth, max_dataset_depth=dataset_depth).build() + actual_res = tree.stats() + expected_res = expected_stats_str + assert_str_equal(expected_res, actual_res) + + From 0ced1e9758a6193e1763f8f6a5b33379c80da277 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 24 Jul 2022 17:39:09 +0200 Subject: [PATCH 033/131] precalculate dataset location (WIP, tests are failing) --- datalad_next/tree.py | 264 ++++++++++++++++++++++--------------------- 1 file changed, 137 insertions(+), 127 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 6c6e4c6c..68197699 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -210,7 +210,7 @@ def is_dataset(path): # (as opposed to git/git-annex repo). # could also query `ds.id`, but checking just for existence # of config file is quicker. - if os.path.isfile(os.path.join(ds.path, ".datalad", "config")): + if Path(Path(ds.path) / ".datalad" / "config").is_file(): return True # if it is not installed, check if it has an installed superdataset. 
@@ -218,10 +218,7 @@ def is_dataset(path): # directory has the .git folder), we check if the directory # is empty (faster) -- as e.g. after a non-recursive `datalad clone` def is_empty_dir(): - with os.scandir(path) as contents: - if any(contents): - return False - return True + return not any(Path(ds.path).iterdir()) if is_empty_dir(): superds = ds.get_superdataset(datalad_only=True, topmost=False, @@ -234,12 +231,6 @@ def is_empty_dir(): return False -def is_path_child_of_parent(child, parent): - parent_abs = os.path.abspath(parent) - child_abs = os.path.abspath(child) - return os.path.commonpath([parent_abs]) == \ - os.path.commonpath([parent_abs, child_abs]) - class _TreeNode(object): """ Base class for a directory or file represented as a single @@ -258,7 +249,7 @@ def __eq__(self, other): return self.path == other.path def __hash__(self): - return hash(self.path) + return hash(str(self.path)) def __str__(self): if self.depth == 0 or self.use_full_paths: @@ -275,12 +266,24 @@ def __str__(self): return prefix + str(path) - def _get_tree_root(self): + @property + def tree_root(self): """Calculate tree root path from node path and depth""" - root = self.path - for _ in range(self.depth): - root = os.path.dirname(root) - return root + return self.parents[0] + + @property + def parents(self): + """ + List of parent paths (beginning from the tree root) + in top-down order. + """ + parents_from_tree_root = [] + for depth, path in enumerate(Path(self.path).parents): + if depth >= self.depth: + break + parents_from_tree_root.append(path) + + return parents_from_tree_root[::-1] # top-down order class Tree(object): @@ -290,9 +293,10 @@ class Tree(object): of the whole tree and the statistics (counts of different node types). """ - def __init__(self, root: Path, max_depth=None, + def __init__(self, + root: Path, max_depth=None, include_files=False, include_hidden=False, - full_paths=False, exclude_datasets=False, + full_paths=False, traverse_datasets=True, skip_root=False): # TODO: root should already be given as Path object @@ -307,7 +311,7 @@ def __init__(self, root: Path, max_depth=None, self.include_files = include_files self.include_hidden = include_hidden self.full_paths = full_paths - self.exclude_datasets = exclude_datasets + self.traverse_datasets = traverse_datasets self.skip_root = skip_root # do not print first line with root path self._lines = [] # list of lines of output string @@ -318,10 +322,10 @@ def _get_depth(self, path: Path): """Directory depth of current path relative to root of the tree""" return len(path.relative_to(self.root).parts) - def _is_max_depth_reached(self, path): - if self.max_depth is not None: - return self._get_depth(path) == self.max_depth - return False # unlimited depth + def _max_depth_reached(self, path): + if self.max_depth is None: + return False # unlimited depth + return self._get_depth(path) >= self.max_depth def stats(self): """ @@ -355,11 +359,23 @@ def is_excluded(path): exclusion_criteria.append(path.name.startswith(".")) return any(exclusion_criteria) - yield DirectoryOrDatasetNode( - dir_path, self._get_depth(dir_path), is_last_child - ) + if not self.skip_root or \ + self.skip_root and self._get_depth(dir_path) > 0: + yield DirectoryOrDatasetNode( + dir_path, self._get_depth(dir_path), is_last_child + ) + + condition = (self.traverse_datasets or + not self.traverse_datasets and (not is_dataset(dir_path) or self._get_depth(dir_path) == 0)) + # if not self.traverse_datasets: + # print(f"dir_path = {dir_path}") + # print(f"self.traverse_datasets 
= {self.traverse_datasets}") + # print(f" self._get_depth(dir_path) = {self._get_depth(dir_path)}") + # print(f"is_dataset(dir_path) = {is_dataset(dir_path)}") + # print(f"condition = {condition}") + + if not self._max_depth_reached(dir_path) and condition: - if self._get_depth(dir_path) < self.max_depth: # apply exclusion filter selected_children = ( p for p in Path(dir_path).iterdir() @@ -429,9 +445,6 @@ def _yield_lines(self): for node in self.generate_nodes(): - if self.skip_root and node.depth == 0: - continue - if node.is_last_child: # last child of its subtree levels_with_exhausted_subtree.add(node.depth) else: @@ -458,7 +471,7 @@ class DatasetTree(Tree): def __init__(self, *args, max_dataset_depth=0, **kwargs): super().__init__(*args, **kwargs) # by default, do not recurse into datasets' subdirectories - # (that are not datasets themselves) + # (other than paths to nested subdatasets) if self.max_depth is None: self.max_depth = 0 @@ -470,81 +483,105 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): self._current_dataset_depth = -1 # -1 means no datasets encountered def _is_max_dataset_depth_reached(self): - if self.max_dataset_depth is not None: - return self._current_dataset_depth > self.max_dataset_depth - return False # unlimited depth + return self._current_dataset_depth > self.max_dataset_depth + + def _find_all_datasets(self): + """Precalculate the set of all datasets under tree root""" + datasets = set([]) + ds_generator = ( + p + for p in Path(self.root).glob("**/*") + if p.is_dir() and + not p.name == ".git" and # won't find datasets inside .git folder + is_dataset(p) + ) + for path in ds_generator: + ds = DatasetNode(path, self._get_depth(path), is_last_child=False) + self._current_dataset_depth = ds.ds_depth + if not self._is_max_dataset_depth_reached(): + datasets.add(Path(ds.path)) + return datasets + + def _is_last_ds_child(self, path, datasets): + """Takes output of _find_all_datasets()""" + path = Path(path) + parent = path.parent + siblings = sorted([ds for ds in datasets if ds.parent == parent]) + # print(f"path {path.name}: siblings: {[s.name for s in siblings]}") + if not siblings: # means the current path is not a dataset + return True + return siblings[-1] == path - def _generate_nodes(self): + def _generate_ds_with_parents(self): # keep track of datasets' parent directories that have already # been yielded, so we do not yield them twice - visited_parents = set([]) - for path, dirs, files in os.walk(self.root): + self._visited_parents = set([]) - # optimization: do not traverse the .git folder - # (we're not going to find datasets in there) - dirs[:] = [d for d in dirs if d != ".git"] + # precalculate datasets up to max_dataset_depth + all_datasets = self._find_all_datasets() - # sort directories alphabetically in-place - dirs.sort() - - # update the last child nodes - if dirs: # if there is a next level - self._last_children.add( - os.path.join(path, dirs[-1]) - ) - - if is_dataset(path): - current_depth = self._get_depth(path) + ds_tree = Tree( + self.root, + max_depth=None, # unlimited traversal + include_files=False, + include_hidden=True, + skip_root=self.skip_root, + full_paths=self.full_paths + ) - ds = DatasetNode( - path, current_depth, self._is_last_child(path), - use_full_paths=self.full_paths) + for node in ds_tree.generate_nodes(): + ds = node + if isinstance(node, DatasetNode): self._current_dataset_depth = ds.ds_depth - if self._is_max_dataset_depth_reached(): - # we do not prune the walk (empty the list of - # directories to traverse 
next), but just skip to - # to the next node. this is because datasets - # of a given depth *could* be located at any - # directory depth, even below datasets at the same - # or deeper level. - continue - - # yield intermediate directories - for parent, parent_depth in ds.parent_nodes(): - if parent not in visited_parents: - visited_parents.add(parent) - yield DirectoryOrDatasetNode( - parent, - parent_depth, - self._is_last_child(parent), - use_full_paths=self.full_paths - ) - visited_parents.add(ds.path) - yield ds - - if self.max_depth > 0: - # yield directories/files underneath dataset. - # here, the `max_depth` parameter refers to the - # directory depth of each dataset's subtree. - subtree = Tree( - ds.path, max_depth=self.max_depth, - full_paths=self.full_paths, - include_files=self.include_files, - include_hidden=self.include_hidden, - exclude_datasets=True - ) - for subtree_node in subtree._generate_nodes(): - # skip root node and dataset nodes - # (will be yielded in main loop) - if subtree_node.depth > 0 and \ - not isinstance(subtree_node, DatasetNode): - # increment depth by offset of root - subtree_node.depth += ds.depth - if isinstance(subtree_node, DirectoryNode): - visited_parents.add(subtree_node.path) - - yield subtree_node + if not self._is_max_dataset_depth_reached(): + # yield parent directories if not already done + for depth, parent in enumerate(ds.parents): + if depth == 0 and self.skip_root: + continue + if parent not in self._visited_parents: + self._visited_parents.add(parent) + + yield DirectoryOrDatasetNode( + parent, + depth, + self._is_last_ds_child(parent, all_datasets), + use_full_paths=self.full_paths + ) + + self._visited_parents.add(ds.path) + ds.is_last_child = self._is_last_ds_child(ds.path, + all_datasets) + yield ds + + def _generate_dataset_nodes(self): + ds_node_generator = self._generate_ds_with_parents() + + for node in ds_node_generator: + if isinstance(node, DatasetNode): + ds = node + # yield dataset contents up to `max_depth` levels + subtree = Tree( + ds.path, + max_depth=self.max_depth, + include_files=self.include_files, + include_hidden=self.include_hidden, + skip_root=True, # dataset root has already been yielded + traverse_datasets=False + ) + for sub_node in subtree.generate_nodes(): + if isinstance(sub_node, DatasetNode): + self._visited_parents.update(sub_node.parents) + yield from ds_node_generator + else: + # offset sub-node depth by the parent's depth + sub_node.depth += ds.depth + yield sub_node + + @increment_node_count + def generate_nodes(self): + for node in self._generate_dataset_nodes(): + yield node class DirectoryNode(_TreeNode): @@ -559,31 +596,6 @@ def __str__(self): return string + "/" return string - def parent_nodes(self): - """ - Generate parent nodes of the current node, excluding the tree root - """ - if self.depth > 0: - parent = self.path - parent_paths = [] - for _ in range(self.depth): - parent = os.path.dirname(parent) - parent_paths.append(parent) - parent_paths = parent_paths[::-1] # sort from root to current node - # print(f"path: {self.path}") - # print(f"parents: {parent_paths}") - for depth, path in enumerate(parent_paths): - # print(depth_offset, p) - yield path, depth - # - # for ix, parent_path in enumerate(parent_paths, start=1): - # yield DirectoryOrDatasetNode( - # path=os.path.join(root, parent_path), - # depth=self.depth - ix, - # is_last_child=True, # temporary - # use_full_paths=self.use_full_paths - # ) - class FileNode(_TreeNode): def __init__(self, *args, **kwargs): @@ -644,9 +656,7 @@ def 
calculate_dataset_depth(self): break self.ds_absolute_depth += 1 - # TODO: Path.is_relative_to() - # TODO: rglob() relative glob - if is_path_child_of_parent(superds.path, self._get_tree_root()): + if Path(superds.path).is_relative_to(self.tree_root): # if the parent dataset is underneath the tree # root, we increment the relative depth self.ds_depth += 1 From 12b61601ef524a19360cfa3d356530b0986f2359 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Mon, 25 Jul 2022 21:02:59 +0200 Subject: [PATCH 034/131] replace parameters include_hidden and include_files with generic callable for node exclusion --- datalad_next/tests/test_tree.py | 18 ++- datalad_next/tree.py | 214 +++++++++++++++----------------- 2 files changed, 114 insertions(+), 118 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 23aaf3eb..a3a6b92c 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -13,7 +13,7 @@ ) from datalad.utils import rmtemp -from ..tree import Tree, DatasetTree +from ..tree import Tree, DatasetTree, build_excluded_node_func """ Tests for datalad tree. @@ -384,7 +384,9 @@ def test_print_tree_with_params_no_ds( root = os.path.join(path_no_ds, "root") tree = Tree( root, max_depth=depth, - include_files=include_files, include_hidden=include_hidden, + exclude_node_func=build_excluded_node_func( + include_hidden=include_hidden, include_files=include_files + ), skip_root=True # skip the first line with the root directory ) lines = tree.print_line() @@ -432,7 +434,9 @@ def test_print_stats_no_ds( root = path_no_ds / 'root' tree = Tree( root, max_depth=depth, - include_files=include_files, include_hidden=include_hidden + exclude_node_func=build_excluded_node_func( + include_hidden=include_hidden, include_files=include_files + ), ).build() actual_res = tree.stats() expected_res = expected_stats_str @@ -488,8 +492,12 @@ def test_print_stats_with_ds( def test_print_tree_depth_zero(path_no_ds): root = path_no_ds / "root" - tree = Tree(root, max_depth=0, - include_files=True) # should have no effect + tree = Tree( + root, + max_depth=0, + # including files should have no effect + exclude_node_func=build_excluded_node_func(include_files=True) + ) actual = tree.to_string() expected = str(root) assert_str_equal(expected, actual) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 68197699..7facdec4 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -140,22 +140,25 @@ def __call__( include_hidden=False, full_paths=False, ): + if dataset_depth is not None: tree = DatasetTree( path, max_depth=depth, max_dataset_depth=dataset_depth, - include_files=include_files, - include_hidden=include_hidden, - full_paths=full_paths + full_paths=full_paths, + exclude_node_func=build_excluded_node_func( + include_hidden=include_hidden, include_files=include_files + ) ) else: tree = Tree( path, max_depth=depth, - include_files=include_files, - include_hidden=include_hidden, - full_paths=full_paths + full_paths=full_paths, + exclude_node_func=build_excluded_node_func( + include_hidden=include_hidden, include_files=include_files + ) ) for line in tree.print_line(): @@ -171,6 +174,22 @@ def __call__( ) +def build_excluded_node_func(include_hidden=False, include_files=False): + """ + Default callable to filter tree nodes. + Takes a Path object of the tree node as input. + If the function returns true, the node will not be displayed. 
+ """ + + def is_excluded(path): + return any(( + path.is_file() if not include_files else False, + path.name.startswith(".") if not include_hidden else False + )) + + return is_excluded + + def increment_node_count(node_generator_func): """ Decorator for incrementing the node count whenever @@ -269,7 +288,10 @@ def __str__(self): @property def tree_root(self): """Calculate tree root path from node path and depth""" - return self.parents[0] + parents = self.parents + if parents: + return parents[0] + return self.path # we are the root @property def parents(self): @@ -295,9 +317,9 @@ class Tree(object): def __init__(self, root: Path, max_depth=None, - include_files=False, include_hidden=False, - full_paths=False, traverse_datasets=True, - skip_root=False): + full_paths=False, + skip_root=False, + exclude_node_func=None): # TODO: root should already be given as Path object self.root = Path(root).resolve() @@ -308,16 +330,23 @@ def __init__(self, if max_depth is not None and max_depth < 0: raise ValueError("max_depth must be >= 0") - self.include_files = include_files - self.include_hidden = include_hidden self.full_paths = full_paths - self.traverse_datasets = traverse_datasets self.skip_root = skip_root # do not print first line with root path + # set default filter criteria + self.exclude_node_func = exclude_node_func or self._default_exclude_func + self._lines = [] # list of lines of output string # TODO: stats should automatically register all concrete _TreeNode classes self._stats = {"DirectoryNode": 0, "DatasetNode": 0, "FileNode": 0} + @staticmethod + def _default_exclude_func(path: Path): + """ + By default, only include non-hidden directories. + """ + return any((path.is_file(), path.name.startswith("."))) + def _get_depth(self, path: Path): """Directory depth of current path relative to root of the tree""" return len(path.relative_to(self.root).parts) @@ -351,35 +380,18 @@ def build(self): def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): """Recursively yield directory tree starting from ```dir_path``""" - def is_excluded(path): - exclusion_criteria = [] - if not self.include_files: - exclusion_criteria.append(path.is_file()) - if not self.include_hidden: - exclusion_criteria.append(path.name.startswith(".")) - return any(exclusion_criteria) - if not self.skip_root or \ self.skip_root and self._get_depth(dir_path) > 0: yield DirectoryOrDatasetNode( dir_path, self._get_depth(dir_path), is_last_child ) - condition = (self.traverse_datasets or - not self.traverse_datasets and (not is_dataset(dir_path) or self._get_depth(dir_path) == 0)) - # if not self.traverse_datasets: - # print(f"dir_path = {dir_path}") - # print(f"self.traverse_datasets = {self.traverse_datasets}") - # print(f" self._get_depth(dir_path) = {self._get_depth(dir_path)}") - # print(f"is_dataset(dir_path) = {is_dataset(dir_path)}") - # print(f"condition = {condition}") - - if not self._max_depth_reached(dir_path) and condition: + if not self._max_depth_reached(dir_path): # apply exclusion filter selected_children = ( p for p in Path(dir_path).iterdir() - if not is_excluded(p) + if not self.exclude_node_func(p) ) # sort directory contents alphabetically children = sorted(list(selected_children)) @@ -394,8 +406,7 @@ def is_excluded(path): yield FileNode(child, self._get_depth(child), is_last_child) elif child.is_dir(): # recurse into subdirectories - for node in self._generate_tree_nodes(child, is_last_child): - yield node + yield from self._generate_tree_nodes(child, is_last_child) 
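The thin ``generate_nodes`` wrapper below exists because the recursive
generator cannot carry the counting decorator itself: each recursive
call re-enters the decorator, so nodes yielded by inner calls would be
tallied once per recursion level. A self-contained demonstration of the
problem (toy names, not from this codebase):

    def counted(gen_func):
        def wrapper(*args):
            for item in gen_func(*args):
                wrapper.count += 1
                yield item
        wrapper.count = 0
        return wrapper

    @counted
    def countdown(n):  # recursive, like _generate_tree_nodes
        yield n
        if n > 1:
            yield from countdown(n - 1)  # re-enters the counting wrapper

    list(countdown(3))
    print(countdown.count)  # 6, not 3: inner yields are counted again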
@increment_node_count def generate_nodes(self): @@ -404,6 +415,11 @@ def generate_nodes(self): Yields ``_TreeNode`` objects, each representing a directory or dataset or file. Nodes are traversed in depth-first order. """ + # because the underlying generator is recursive, we cannot + # directly decorate it with `increment_node_count` (since + # it would count twice whenever the function recurses). + # so we decorate a separate function where we just yield + # from the underlying decorator. for node in self._generate_tree_nodes(self.root): yield node @@ -468,6 +484,10 @@ def _yield_lines(self): class DatasetTree(Tree): + """ + DatasetTree is a Tree where hierarchy depth refers to the + subdataset hierarchy level, instead of directory depth. + """ def __init__(self, *args, max_dataset_depth=0, **kwargs): super().__init__(*args, **kwargs) # by default, do not recurse into datasets' subdirectories @@ -482,61 +502,39 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): # 1 means first-level subdatasets (relative to the tree root). self._current_dataset_depth = -1 # -1 means no datasets encountered - def _is_max_dataset_depth_reached(self): - return self._current_dataset_depth > self.max_dataset_depth - - def _find_all_datasets(self): - """Precalculate the set of all datasets under tree root""" - datasets = set([]) - ds_generator = ( - p - for p in Path(self.root).glob("**/*") - if p.is_dir() and - not p.name == ".git" and # won't find datasets inside .git folder - is_dataset(p) - ) - for path in ds_generator: - ds = DatasetNode(path, self._get_depth(path), is_last_child=False) - self._current_dataset_depth = ds.ds_depth - if not self._is_max_dataset_depth_reached(): - datasets.add(Path(ds.path)) - return datasets - - def _is_last_ds_child(self, path, datasets): - """Takes output of _find_all_datasets()""" - path = Path(path) - parent = path.parent - siblings = sorted([ds for ds in datasets if ds.parent == parent]) - # print(f"path {path.name}: siblings: {[s.name for s in siblings]}") - if not siblings: # means the current path is not a dataset - return True - return siblings[-1] == path - - def _generate_ds_with_parents(self): # keep track of datasets' parent directories that have already # been yielded, so we do not yield them twice self._visited_parents = set([]) - # precalculate datasets up to max_dataset_depth - all_datasets = self._find_all_datasets() + def _is_max_dataset_depth_reached(self): + return self._current_dataset_depth > self.max_dataset_depth + + @increment_node_count + def generate_nodes(self): + + def exclude_func(path: Path): + """ + Do not traverse the .git folder (we will not find + datasets underneath it) + """ + return self.exclude_node_func(path) or \ + path.is_dir() and path.name == ".git" ds_tree = Tree( self.root, max_depth=None, # unlimited traversal - include_files=False, - include_hidden=True, + exclude_node_func=exclude_func, skip_root=self.skip_root, full_paths=self.full_paths ) for node in ds_tree.generate_nodes(): - ds = node if isinstance(node, DatasetNode): - self._current_dataset_depth = ds.ds_depth + self._current_dataset_depth = node.ds_depth if not self._is_max_dataset_depth_reached(): # yield parent directories if not already done - for depth, parent in enumerate(ds.parents): + for depth, parent in enumerate(node.parents): if depth == 0 and self.skip_root: continue if parent not in self._visited_parents: @@ -545,43 +543,34 @@ def _generate_ds_with_parents(self): yield DirectoryOrDatasetNode( parent, depth, - self._is_last_ds_child(parent, 
all_datasets), + False, use_full_paths=self.full_paths ) - self._visited_parents.add(ds.path) - ds.is_last_child = self._is_last_ds_child(ds.path, - all_datasets) - yield ds - - def _generate_dataset_nodes(self): - ds_node_generator = self._generate_ds_with_parents() - - for node in ds_node_generator: - if isinstance(node, DatasetNode): - ds = node - # yield dataset contents up to `max_depth` levels - subtree = Tree( - ds.path, - max_depth=self.max_depth, - include_files=self.include_files, - include_hidden=self.include_hidden, - skip_root=True, # dataset root has already been yielded - traverse_datasets=False - ) - for sub_node in subtree.generate_nodes(): - if isinstance(sub_node, DatasetNode): - self._visited_parents.update(sub_node.parents) - yield from ds_node_generator - else: - # offset sub-node depth by the parent's depth - sub_node.depth += ds.depth - yield sub_node + self._visited_parents.add(node.path) + yield node - @increment_node_count - def generate_nodes(self): - for node in self._generate_dataset_nodes(): - yield node + else: + # yield contents of dataset up to `max_depth` levels + if node.path not in self._visited_parents: + # check that the current node is a child of a dataset + parents = [ + parent_depth + for parent_depth, parent in enumerate(node.parents) + if is_dataset(parent) + ] + if parents: + parent_depth = parents[-1] # closest parent + relative_node_depth = node.depth - parent_depth + if relative_node_depth <= self.max_depth: + if isinstance(node, DirectoryNode): + # the current directory is potentially + # a parent of a nested dataset. instead + # of verifying this (would imply advancing + # the generator or retrieving children twice), + # we just add the node to the 'history'. + self._visited_parents.add(node.path) + yield node class DirectoryNode(_TreeNode): @@ -622,9 +611,7 @@ def __init__(self, *args, **kwargs): self.ds = require_dataset(self.path, check_installed=False) self.is_installed = self.ds.is_installed() - self.ds_depth = None - self.ds_absolute_depth = None - self.calculate_dataset_depth() + self.ds_depth, self.ds_absolute_depth = self.calculate_dataset_depth() def __str__(self): install_flag = ", not installed" if not self.is_installed else "" @@ -637,9 +624,8 @@ def calculate_dataset_depth(self): 1. subdataset depth relative to the tree root 2. 
absolute subdataset depth in the full hierarchy """ - # TODO: run this already in the constructor - self.ds_depth = 0 - self.ds_absolute_depth = 0 + ds_depth = 0 + ds_absolute_depth = 0 ds = self.ds @@ -655,10 +641,12 @@ def calculate_dataset_depth(self): # it is a top-level dataset, we are done break - self.ds_absolute_depth += 1 + ds_absolute_depth += 1 if Path(superds.path).is_relative_to(self.tree_root): # if the parent dataset is underneath the tree # root, we increment the relative depth - self.ds_depth += 1 + ds_depth += 1 ds = superds + + return ds_depth, ds_absolute_depth From a4bf0b14e301a650c73cb26a54535492c3d014ff Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 01:24:27 +0200 Subject: [PATCH 035/131] reimplement dataset tree using exclusion function (all tests are passing) --- datalad_next/tree.py | 140 ++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 68 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 7facdec4..293a098a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -183,7 +183,7 @@ def build_excluded_node_func(include_hidden=False, include_files=False): def is_excluded(path): return any(( - path.is_file() if not include_files else False, + not path.is_dir() if not include_files else False, path.name.startswith(".") if not include_hidden else False )) @@ -345,7 +345,7 @@ def _default_exclude_func(path: Path): """ By default, only include non-hidden directories. """ - return any((path.is_file(), path.name.startswith("."))) + return any((not path.is_dir(), path.name.startswith("."))) def _get_depth(self, path: Path): """Directory depth of current path relative to root of the tree""" @@ -379,7 +379,8 @@ def build(self): return self def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): - """Recursively yield directory tree starting from ```dir_path``""" + """Recursively yield directory tree nodes starting from ``dir_path``""" + if not self.skip_root or \ self.skip_root and self._get_depth(dir_path) > 0: yield DirectoryOrDatasetNode( @@ -402,11 +403,12 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): # (needed for displaying special end-of-subtree prefix) is_last_child = (child == children[-1]) - if child.is_file(): - yield FileNode(child, self._get_depth(child), is_last_child) - elif child.is_dir(): + if child.is_dir(): # recurse into subdirectories yield from self._generate_tree_nodes(child, is_last_child) + else: + yield FileNode(child, self._get_depth(child), is_last_child) + @increment_node_count def generate_nodes(self): @@ -496,81 +498,83 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): self.max_depth = 0 self.max_dataset_depth = max_dataset_depth - # meaning of 'dataset depth': - # -1 means no datasets encountered, - # 0 means top-level superdatasets (relative to the tree root), - # 1 means first-level subdatasets (relative to the tree root). 
- self._current_dataset_depth = -1 # -1 means no datasets encountered - - # keep track of datasets' parent directories that have already - # been yielded, so we do not yield them twice - self._visited_parents = set([]) - - def _is_max_dataset_depth_reached(self): - return self._current_dataset_depth > self.max_dataset_depth @increment_node_count def generate_nodes(self): + def is_git_folder(path: Path): + """Do not traverse the .git folder (we will not find + datasets underneath it)""" + return path.is_dir() and path.name == ".git" + + def ds_exceeds_max_ds_depth(path: Path): + """Exclude datasets with ds_depth > max_ds_depth""" + if path.is_dir() and is_dataset(path): + ds = DatasetNode(path, self._get_depth(path), False) + return ds.ds_depth > self.max_dataset_depth + return False + + def ds_child_exceeds_max_depth(path: Path): + """Exclude files or directories underneath a dataset, + if they have depth (relative to dataset root) > max_depth""" + if not path.is_dir() or not is_dataset(path): + node = _TreeNode(path, self._get_depth(path), False) + ds_parents = [p for p in node.parents if is_dataset(p)] + if ds_parents: + parent = ds_parents[-1] # closest parent + relative_depth = node.depth - self._get_depth(parent) + return relative_depth > self.max_depth + return True + + def is_ds_parent_with_depth(path: Path): + """Exclude directory if it is a parent of a dataset with + ds_depth > max_ds_depth""" + + def exclude(p: Path): + is_parent_or_child = path.is_relative_to(p) or p.is_relative_to(path) + return not is_parent_or_child or is_git_folder(path) + + if path.is_dir() and not is_dataset(path): + # search in the subtree that includes the current path + subtree = Tree( + self.root, + max_depth=None, + exclude_node_func=exclude, + skip_root=True + ) + first_ds_child = next(( + node + for node in subtree.generate_nodes() + if isinstance(node, DatasetNode) and path in node.parents + ), None) + + return first_ds_child is not None and \ + first_ds_child.ds_depth <= self.max_dataset_depth + return False + def exclude_func(path: Path): - """ - Do not traverse the .git folder (we will not find - datasets underneath it) - """ - return self.exclude_node_func(path) or \ - path.is_dir() and path.name == ".git" + criteria = self.exclude_node_func(path) + + if path.is_dir() and is_dataset(path): + criteria |= ds_exceeds_max_ds_depth(path) + else: + criteria |= is_git_folder(path) + + if not path.is_dir() or \ + not is_ds_parent_with_depth(path): + criteria |= ds_child_exceeds_max_depth(path) + + return criteria ds_tree = Tree( self.root, - max_depth=None, # unlimited traversal + max_depth=None, # unlimited traversal (datasets could be anywhere) exclude_node_func=exclude_func, skip_root=self.skip_root, full_paths=self.full_paths ) - for node in ds_tree.generate_nodes(): - if isinstance(node, DatasetNode): - self._current_dataset_depth = node.ds_depth - - if not self._is_max_dataset_depth_reached(): - # yield parent directories if not already done - for depth, parent in enumerate(node.parents): - if depth == 0 and self.skip_root: - continue - if parent not in self._visited_parents: - self._visited_parents.add(parent) - - yield DirectoryOrDatasetNode( - parent, - depth, - False, - use_full_paths=self.full_paths - ) - - self._visited_parents.add(node.path) - yield node - - else: - # yield contents of dataset up to `max_depth` levels - if node.path not in self._visited_parents: - # check that the current node is a child of a dataset - parents = [ - parent_depth - for parent_depth, parent in 
enumerate(node.parents) - if is_dataset(parent) - ] - if parents: - parent_depth = parents[-1] # closest parent - relative_node_depth = node.depth - parent_depth - if relative_node_depth <= self.max_depth: - if isinstance(node, DirectoryNode): - # the current directory is potentially - # a parent of a nested dataset. instead - # of verifying this (would imply advancing - # the generator or retrieving children twice), - # we just add the node to the 'history'. - self._visited_parents.add(node.path) - yield node + yield from ds_tree.generate_nodes() class DirectoryNode(_TreeNode): From 993999a55b2c59075ac6f1a285327ce3a18e49b1 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 02:19:10 +0200 Subject: [PATCH 036/131] optimize search of child datasets by starting from the current path as subtree root --- datalad_next/tree.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 293a098a..c50b9238 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -289,9 +289,7 @@ def __str__(self): def tree_root(self): """Calculate tree root path from node path and depth""" parents = self.parents - if parents: - return parents[0] - return self.path # we are the root + return parents[0] if parents else self.path # we are the root @property def parents(self): @@ -531,25 +529,32 @@ def is_ds_parent_with_depth(path: Path): ds_depth > max_ds_depth""" def exclude(p: Path): - is_parent_or_child = path.is_relative_to(p) or p.is_relative_to(path) - return not is_parent_or_child or is_git_folder(path) + return not p.is_dir() or is_git_folder(p) if path.is_dir() and not is_dataset(path): - # search in the subtree that includes the current path + # search in the subtree with the current path as root subtree = Tree( - self.root, + path, max_depth=None, exclude_node_func=exclude, skip_root=True ) - first_ds_child = next(( - node - for node in subtree.generate_nodes() - if isinstance(node, DatasetNode) and path in node.parents - ), None) - - return first_ds_child is not None and \ - first_ds_child.ds_depth <= self.max_dataset_depth + + def child_datasets(): + for node in subtree.generate_nodes(): + if isinstance(node, DatasetNode): + # offset depth by depth of current path + node.depth += self._get_depth(path) + # need to recalculate dataset depth after + # updating directory depth + node.ds_depth, _ = node.calculate_dataset_depth() + yield node + + return any( + ds.ds_depth <= self.max_dataset_depth + for ds in child_datasets() + ) + return False def exclude_func(path: Path): From c892152cf76e5178b4ed6f71a1f6e308d708a90f Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 07:42:26 +0200 Subject: [PATCH 037/131] update command parameter help texts --- datalad_next/tree.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index c50b9238..40733bab 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -91,7 +91,10 @@ class TreeCommand(Interface): depth=Parameter( args=("-L", "--depth",), doc="""maximum level of directory tree to display. - If not specified, will display all levels.""", + If not specified, will display all levels. 
+ If paired with [CMD: --dataset-depth CMD][PY: dataset_depth PY], + refers to the maximum directory level to display underneath each + dataset.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), dataset_depth=Parameter( args=("-R", "--dataset-depth",), @@ -119,13 +122,14 @@ class TreeCommand(Interface): code_cmd="datalad tree -L 3 --include-files"), dict(text="List all first- and second-level subdatasets " "of parent datasets located anywhere under /tmp, " - "regardless of directory depth", - code_py="tree('/tmp', dataset_depth=2, depth=0, full_paths=True)", - code_cmd="datalad tree /tmp -R 2 -L 0 --full-paths"), - dict(text="Display first- and second-level subdatasets and their" - "contents up to 3 directories deep (within each subdataset)", - code_py="tree('.', dataset_depth=2, directory_depth=1)", - code_cmd="datalad tree -R 2 -L 3"), + "regardless of directory depth, " + "including in hidden directories", + code_py="tree('/tmp', dataset_depth=2, include_hidden=True)", + code_cmd="datalad tree /tmp -R 2 --include-hidden"), + dict(text="Display first- and second-level subdatasets and their " + "contents, up to 1 directory deep within each dataset", + code_py="tree(dataset_depth=2, depth=1)", + code_cmd="datalad tree -R 2 -L 1"), ] @staticmethod From 59592bb84ee48ce80febadb39cb6a931bd4d9219 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 20:19:32 +0200 Subject: [PATCH 038/131] generate stats string automatically based on subclasses of _TreeNode --- datalad_next/tests/test_tree.py | 6 +- datalad_next/tree.py | 141 +++++++++++++++++++------------- 2 files changed, 86 insertions(+), 61 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index a3a6b92c..e63b1fed 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -279,7 +279,7 @@ def format_param_ids(val): matrix_ds = [ { "depth": 1, - "expected_stats_str": "2 datasets, 1 directories, 0 files", + "expected_stats_str": "2 datasets, 1 directory, 0 files", "expected_str": """ ├── repo0/ ├── superds0/ [DS~0] @@ -318,7 +318,7 @@ def format_param_ids(val): { "dataset_depth": 0, "depth": 1, - "expected_stats_str": "3 datasets, 1 directories, 0 files", + "expected_stats_str": "3 datasets, 1 directory, 0 files", "expected_str": """ ├── superds0/ [DS~0] └── superds1/ [DS~0] @@ -329,7 +329,7 @@ def format_param_ids(val): { "dataset_depth": 1, "depth": 0, - "expected_stats_str": "6 datasets, 1 directories, 0 files", + "expected_stats_str": "6 datasets, 1 directory, 0 files", "expected_str": """ ├── superds0/ [DS~0] | └── sd0_subds0/ [DS~1] diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 40733bab..d816d9ce 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -17,22 +17,22 @@ see which directories are datalad datasets, so that I can locate my datasets in the context of the whole directory layout. ---- - This is basically what is implemented by the `tree-datalad` utility -- - just `tree` with visual markers for datasets. + This is basically just `tree` with visual markers for datasets. In addition to it, `datalad-tree` provides the following: 1. The subdataset hierarchy level information (included in the dataset marker, e.g. [DS~0]). This is the absolute level, meaning it may take into account - superdatasets that are not included in the display. - 2. The option to list only directories that are datasets + superdatasets that are located above the tree root and thus are + not included in the display. + 2. 
The option to list only directories that are datasets. 3. The count of displayed datasets in the "report line" - (where `tree` only reports count of directories and files) + (where `tree` only reports count of directories and files). (2) Descriptor of nested subdataset hierarchies: --- As a datalad user, I want to visualize the structure of multiple datasets and their hierarchies at once based on the subdataset nesting level, - regardless of their actual depth in the directory tree. This helps me + regardless of their depth in the directory tree. This helps me understand and communicate the layout of my datasets. --- This is the more datalad-specific case. Here we redefine 'depth' as the @@ -108,10 +108,6 @@ class TreeCommand(Interface): args=("-a", "--include-hidden",), doc="""include hidden files/directories in output""", action='store_true'), - full_paths=Parameter( - args=("--full-paths",), - doc="""display the full path for files/directories""", - action='store_true'), ) _examples_ = [ @@ -142,7 +138,6 @@ def __call__( dataset_depth=None, include_files=False, include_hidden=False, - full_paths=False, ): if dataset_depth is not None: @@ -150,7 +145,6 @@ def __call__( path, max_depth=depth, max_dataset_depth=dataset_depth, - full_paths=full_paths, exclude_node_func=build_excluded_node_func( include_hidden=include_hidden, include_files=include_files ) @@ -159,7 +153,6 @@ def __call__( tree = Tree( path, max_depth=depth, - full_paths=full_paths, exclude_node_func=build_excluded_node_func( include_hidden=include_hidden, include_files=include_files ) @@ -180,9 +173,9 @@ def __call__( def build_excluded_node_func(include_hidden=False, include_files=False): """ - Default callable to filter tree nodes. - Takes a Path object of the tree node as input. - If the function returns true, the node will not be displayed. + Returns a callable to filter tree nodes. + The callable takes the Path object of a tree node as input, + and returns true if the node should not be displayed in the tree. """ def is_excluded(path): @@ -249,24 +242,24 @@ def is_empty_dir(): if superds is not None: return True - # TODO: do we have a way to detect a datalad dataset if it - # is not installed and it is not a subdataset? + # TODO: is there a way to detect a datalad dataset if it + # is not installed and it is not a subdataset? For now, we don't return False -class _TreeNode(object): +class _TreeNode: """ Base class for a directory or file represented as a single tree node and printed as single line of the 'tree' output. 
""" COLOR = None # ANSI color for the path, if terminal color are enabled + PREFIX_MIDDLE_CHILD = "├── " # symbol for tip of the 'tree branch' + PREFIX_LAST_CHILD = "└── " - def __init__(self, path: str, depth: int, is_last_child: bool, - use_full_paths=False): - self.path = path # path relative to tree root - self.depth = depth # depth in the directory tree - self.is_last_child = is_last_child # if it is last item of its subtree - self.use_full_paths = use_full_paths + def __init__(self, path: str, depth: int, is_last_child: bool): + self.path = path + self.depth = depth # depth in the Tree + self.is_last_child = is_last_child # if is last sibling in its subtree def __eq__(self, other): return self.path == other.path @@ -275,7 +268,7 @@ def __hash__(self): return hash(str(self.path)) def __str__(self): - if self.depth == 0 or self.use_full_paths: + if self.depth == 0: path = self.path else: path = os.path.basename(self.path) @@ -285,15 +278,24 @@ def __str__(self): prefix = "" if self.depth > 0: - prefix = "└── " if self.is_last_child else "├── " + prefix = self.PREFIX_LAST_CHILD if self.is_last_child \ + else self.PREFIX_MIDDLE_CHILD return prefix + str(path) + @staticmethod + def stats_description(count): + """String describing the node count that will be + included in the tree's report line""" + raise NotImplementedError + # return str(count) + (" node" if int(count) == 1 else " nodes") + @property def tree_root(self): """Calculate tree root path from node path and depth""" parents = self.parents - return parents[0] if parents else self.path # we are the root + return parents[0] if parents \ + else self.path # we are the root @property def parents(self): @@ -310,7 +312,7 @@ def parents(self): return parents_from_tree_root[::-1] # top-down order -class Tree(object): +class Tree: """ Main class for building and serializing a directory tree. Does not store ``_TreeNode`` objects, only the string representation @@ -318,8 +320,8 @@ class Tree(object): """ def __init__(self, - root: Path, max_depth=None, - full_paths=False, + root: Path, + max_depth=None, skip_root=False, exclude_node_func=None): @@ -332,25 +334,26 @@ def __init__(self, if max_depth is not None and max_depth < 0: raise ValueError("max_depth must be >= 0") - self.full_paths = full_paths - self.skip_root = skip_root # do not print first line with root path + # do not print first line with root path + self.skip_root = skip_root - # set default filter criteria + # set custom or default filter criteria self.exclude_node_func = exclude_node_func or self._default_exclude_func - self._lines = [] # list of lines of output string - # TODO: stats should automatically register all concrete _TreeNode classes - self._stats = {"DirectoryNode": 0, "DatasetNode": 0, "FileNode": 0} + # store list of lines of output string + self._lines = [] + + # dict with count of nodes for each _TreeNode subtype + self._stats = {node_type.__name__: 0 + for node_type in _TreeNode.__subclasses__()} @staticmethod def _default_exclude_func(path: Path): - """ - By default, only include non-hidden directories. 
- """ + """By default, only include non-hidden directories""" return any((not path.is_dir(), path.name.startswith("."))) def _get_depth(self, path: Path): - """Directory depth of current path relative to root of the tree""" + """Get directory depth of current path relative to root of the tree""" return len(path.relative_to(self.root).parts) def _max_depth_reached(self, path): @@ -360,18 +363,22 @@ def _max_depth_reached(self, path): def stats(self): """ - Equivalent of tree command's 'report line' at the end of the - tree output. + Returns a string with counts of different node types, similar + to the tree command's 'report line' at the end of the tree + output. The 3 node types (directory, dataset, file) are mutually exclusive, so their sum equals to the total node count. Does not count the root itself, only the contents below the root. """ - return f"{self._stats['DatasetNode']} datasets, " \ - f"{self._stats['DirectoryNode']} directories, " \ - f"{self._stats['FileNode']} files" - - def _total_nodes(self): - return sum(c for c in self._stats.values()) + # sort node type names alphabetically + node_types = sorted( + _TreeNode.__subclasses__(), + key=lambda c: c.__name__ + ) + return ", ".join( + node_type.stats_description(self._stats[node_type.__name__]) + for node_type in node_types + ) def build(self): """ @@ -489,8 +496,10 @@ def _yield_lines(self): class DatasetTree(Tree): """ - DatasetTree is a Tree where hierarchy depth refers to the + ``DatasetTree`` is a ``Tree`` where hierarchy depth refers to the subdataset hierarchy level, instead of directory depth. + Because of the different semantics of the ``max_depth`` parameter, + we implement a separate subclass of ``Tree``. """ def __init__(self, *args, max_dataset_depth=0, **kwargs): super().__init__(*args, **kwargs) @@ -520,11 +529,14 @@ def ds_child_exceeds_max_depth(path: Path): """Exclude files or directories underneath a dataset, if they have depth (relative to dataset root) > max_depth""" if not path.is_dir() or not is_dataset(path): - node = _TreeNode(path, self._get_depth(path), False) - ds_parents = [p for p in node.parents if is_dataset(p)] + ds_parents = [ + p for p in path.parents + if p.is_relative_to(self.root) and + is_dataset(p) + ] if ds_parents: parent = ds_parents[-1] # closest parent - relative_depth = node.depth - self._get_depth(parent) + relative_depth = self._get_depth(path) - self._get_depth(parent) return relative_depth > self.max_depth return True @@ -571,6 +583,7 @@ def exclude_func(path: Path): if not path.is_dir() or \ not is_ds_parent_with_depth(path): + pass criteria |= ds_child_exceeds_max_depth(path) return criteria @@ -580,7 +593,6 @@ def exclude_func(path: Path): max_depth=None, # unlimited traversal (datasets could be anywhere) exclude_node_func=exclude_func, skip_root=self.skip_root, - full_paths=self.full_paths ) yield from ds_tree.generate_nodes() @@ -598,13 +610,21 @@ def __str__(self): return string + "/" return string + @staticmethod + def stats_description(count): + return str(count) + (" directory" if int(count) == 1 else " directories") + class FileNode(_TreeNode): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + @staticmethod + def stats_description(count): + return str(count) + (" file" if int(count) == 1 else " files") + -class DirectoryOrDatasetNode(_TreeNode): +class DirectoryOrDatasetNode: """ Factory class for creating either a ``DirectoryNode`` or ``DatasetNode``, based on whether the current path is a dataset or not. 
@@ -616,7 +636,7 @@ def __new__(cls, path, *args, **kwargs): return DirectoryNode(path, *args, **kwargs) -class DatasetNode(DirectoryNode): +class DatasetNode(_TreeNode): COLOR = ansi_colors.MAGENTA def __init__(self, *args, **kwargs): @@ -628,8 +648,13 @@ def __init__(self, *args, **kwargs): def __str__(self): install_flag = ", not installed" if not self.is_installed else "" - suffix = f" [DS~{self.ds_absolute_depth}{install_flag}]" - return super().__str__() + suffix + dir_suffix = "/" if self.depth > 0 else "" + ds_suffix = f" [DS~{self.ds_absolute_depth}{install_flag}]" + return super().__str__() + dir_suffix + ds_suffix + + @staticmethod + def stats_description(count): + return str(count) + (" dataset" if int(count) == 1 else " datasets") def calculate_dataset_depth(self): """ From 602f1618e69a5d717da6fd724e54b0bd54ebd2de Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 20:41:01 +0200 Subject: [PATCH 039/131] extract symbols for indentation display --- datalad_next/tests/test_tree.py | 58 ++++++++++++++++----------------- datalad_next/tree.py | 16 ++++++--- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index e63b1fed..e3b9cd1d 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -191,16 +191,16 @@ def format_param_ids(val): "expected_str": """ ├── dir0/ ├── dir1/ -| └── dir1_file0 +│ └── dir1_file0 ├── dir2/ -| ├── dir2_dir0/ -| ├── dir2_dir1/ -| | └── dir2_dir1_file0 -| ├── dir2_dir2/ -| | ├── dir2_dir2_file0 -| | └── dir2_dir2_file1 -| ├── dir2_file0 -| └── dir2_file1 +│ ├── dir2_dir0/ +│ ├── dir2_dir1/ +│ │ └── dir2_dir1_file0 +│ ├── dir2_dir2/ +│ │ ├── dir2_dir2_file0 +│ │ └── dir2_dir2_file1 +│ ├── dir2_file0 +│ └── dir2_file1 ├── file0 └── file1 """ @@ -227,21 +227,21 @@ def format_param_ids(val): "expected_stats_str": "0 datasets, 7 directories, 11 files", "expected_str": """ ├── .dir3/ -| ├── .dir3_file1 -| └── dir3_file0 +│ ├── .dir3_file1 +│ └── dir3_file0 ├── .file2 ├── dir0/ ├── dir1/ -| └── dir1_file0 +│ └── dir1_file0 ├── dir2/ -| ├── dir2_dir0/ -| ├── dir2_dir1/ -| | └── dir2_dir1_file0 -| ├── dir2_dir2/ -| | ├── dir2_dir2_file0 -| | └── dir2_dir2_file1 -| ├── dir2_file0 -| └── dir2_file1 +│ ├── dir2_dir0/ +│ ├── dir2_dir1/ +│ │ └── dir2_dir1_file0 +│ ├── dir2_dir2/ +│ │ ├── dir2_dir2_file0 +│ │ └── dir2_dir2_file1 +│ ├── dir2_file0 +│ └── dir2_file1 ├── file0 └── file1 """ @@ -292,12 +292,12 @@ def format_param_ids(val): "expected_str": """ ├── repo0/ ├── superds0/ [DS~0] -| └── sd0_subds0/ [DS~1] -| └── sd0_sub0_subds0/ [DS~2] +│ └── sd0_subds0/ [DS~1] +│ └── sd0_sub0_subds0/ [DS~2] └── superds1/ [DS~0] ├── sd1_dir0/ - | ├── sd1_d0_repo0/ - | └── sd1_d0_subds0/ [DS~1] + │ ├── sd1_d0_repo0/ + │ └── sd1_d0_subds0/ [DS~1] ├── sd1_ds0/ [DS~0] └── sd1_subds0/ [DS~1, not installed] """, @@ -332,10 +332,10 @@ def format_param_ids(val): "expected_stats_str": "6 datasets, 1 directory, 0 files", "expected_str": """ ├── superds0/ [DS~0] -| └── sd0_subds0/ [DS~1] +│ └── sd0_subds0/ [DS~1] └── superds1/ [DS~0] ├── sd1_dir0/ - | └── sd1_d0_subds0/ [DS~1] + │ └── sd1_d0_subds0/ [DS~1] ├── sd1_ds0/ [DS~0] └── sd1_subds0/ [DS~1, not installed] """ @@ -346,11 +346,11 @@ def format_param_ids(val): "expected_stats_str": "6 datasets, 2 directories, 0 files", "expected_str": """ ├── superds0/ [DS~0] -| └── sd0_subds0/ [DS~1] +│ └── sd0_subds0/ [DS~1] └── superds1/ [DS~0] ├── sd1_dir0/ - | ├── sd1_d0_repo0/ - | └── sd1_d0_subds0/ [DS~1] + │ ├── 
sd1_d0_repo0/ + │ └── sd1_d0_subds0/ [DS~1] ├── sd1_ds0/ [DS~0] └── sd1_subds0/ [DS~1, not installed] """ diff --git a/datalad_next/tree.py b/datalad_next/tree.py index d816d9ce..2682847b 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -253,8 +253,14 @@ class _TreeNode: tree node and printed as single line of the 'tree' output. """ COLOR = None # ANSI color for the path, if terminal color are enabled - PREFIX_MIDDLE_CHILD = "├── " # symbol for tip of the 'tree branch' + # symbols for the tip of the 'tree branch', depending on + # whether a node is the last in it subtree or not + PREFIX_MIDDLE_CHILD = "├── " PREFIX_LAST_CHILD = "└── " + # symbol for representing the continuation of a 'tree branch' + INDENTATION_SYMBOL = "│" + # space between the indentation symbol of one level and the next + INDENTATION_SPACING = " " def __init__(self, path: str, depth: int, is_last_child: bool): self.path = path @@ -418,7 +424,6 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): else: yield FileNode(child, self._get_depth(child), is_last_child) - @increment_node_count def generate_nodes(self): """ @@ -481,11 +486,12 @@ def _yield_lines(self): # build indentation string indentation = "" + spacing = node.INDENTATION_SPACING if node.depth > 0: indentation_symbols_for_levels = [ - " " - if level in levels_with_exhausted_subtree - else "| " + (node.INDENTATION_SYMBOL + if level not in levels_with_exhausted_subtree + else " ") + spacing for level in range(1, node.depth) ] indentation = "".join(indentation_symbols_for_levels) From 18092946cadff65018fc1ff968576576def63273 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 21:05:01 +0200 Subject: [PATCH 040/131] replace remaining usages of os module with pathlib --- datalad_next/tests/test_tree.py | 37 +++++++++++++++------------------ datalad_next/tree.py | 9 ++++---- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index e3b9cd1d..ea7d646c 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,5 +1,3 @@ -import os -from os.path import join as opj from datetime import datetime from pathlib import Path @@ -9,7 +7,7 @@ from datalad.tests.utils_pytest import ( assert_raises, assert_str_equal, - with_tree, assert_re_in + with_tree ) from datalad.utils import rmtemp @@ -20,11 +18,12 @@ """ -def create_temp_dir_tree(tree_dict): +def create_temp_dir_tree(tree_dict: dict) -> Path: """ Create a temporary directory tree. This is a shim for the 'with_tree' decorator so it can be used in a module-scoped pytest fixture. + Returns the Path object of the root directory. 
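+
+    For instance, with a hypothetical layout::
+
+        temp_root = create_temp_dir_tree({"root": {"file0": ''}})
+        assert (temp_root / "root" / "file0").is_file()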
""" # function to be decorated by 'with_tree' # just return the argument (will be the created temp path) @@ -37,7 +36,7 @@ def create_temp_dir_tree(tree_dict): # call the 'with_tree' decorator to return the path # of the created temp dir root, without deleting it temp_dir_root = with_tree(tree_dict, delete=False)(identity_func)() - return temp_dir_root + return Path(temp_dir_root).resolve() @pytest.fixture(scope="module") @@ -52,7 +51,7 @@ def path_no_ds(): "dir3_file0": '', ".dir3_file1": '', }, - "dir0": {}, # empty dir + "dir0": {}, "dir1": { "dir1_file0": '', }, @@ -75,9 +74,9 @@ def path_no_ds(): } temp_dir_root = create_temp_dir_tree(dir_tree) - yield Path(temp_dir_root).resolve() + yield temp_dir_root rmtemp(temp_dir_root) - assert not os.path.exists(temp_dir_root) + assert not temp_dir_root.exists() @pytest.fixture(scope="module") @@ -112,25 +111,25 @@ def path_ds(): temp_dir_root = create_temp_dir_tree(ds_tree) # create datasets / repos - root = opj(temp_dir_root, "root") - BasicGitTestRepo(path=opj(root, "repo0"), puke_if_exists=False) - superds0 = Dataset(opj(root, "superds0")).create(force=True) + root = temp_dir_root / "root" + BasicGitTestRepo(path=root / "repo0", puke_if_exists=False) + superds0 = Dataset(root / "superds0").create(force=True) sd0_subds0 = superds0.create("sd0_subds0", force=True) sd0_subds0.create("sd0_sub0_subds0", force=True) - superds1 = Dataset(opj(root, "superds1")).create(force=True) - superds1.create(opj("sd1_dir0", "sd1_d0_subds0"), force=True) - Dataset(opj(root, "superds1", "sd1_ds0")).create(force=True) + superds1 = Dataset(root / "superds1").create(force=True) + superds1.create(Path("sd1_dir0") / "sd1_d0_subds0", force=True) + Dataset(root / "superds1" / "sd1_ds0").create(force=True) BasicGitTestRepo( - path=opj(root, "superds1", "sd1_dir0", "sd1_d0_repo0"), + path=root / "superds1" / "sd1_dir0" / "sd1_d0_repo0", puke_if_exists=False) sd1_subds0 = superds1.create("sd1_subds0", force=True) sd1_subds0.drop(what='all', reckless='kill', recursive=True) - yield Path(temp_dir_root).resolve() + yield temp_dir_root # delete temp dir rmtemp(temp_dir_root) - assert not os.path.exists(temp_dir_root) + assert not temp_dir_root.exists() def format_param_ids(val): @@ -381,7 +380,7 @@ def build_param_matrix(matrix, params): def test_print_tree_with_params_no_ds( path_no_ds, depth, include_files, include_hidden, expected_str ): - root = os.path.join(path_no_ds, "root") + root = Path(path_no_ds) / "root" tree = Tree( root, max_depth=depth, exclude_node_func=build_excluded_node_func( @@ -541,5 +540,3 @@ def test_print_stats_with_max_dataset_depth( actual_res = tree.stats() expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) - - diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 2682847b..405c4733 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -43,7 +43,6 @@ __docformat__ = 'restructuredtext' import logging -import os from functools import wraps, lru_cache from pathlib import Path @@ -142,7 +141,7 @@ def __call__( if dataset_depth is not None: tree = DatasetTree( - path, + Path(path), max_depth=depth, max_dataset_depth=dataset_depth, exclude_node_func=build_excluded_node_func( @@ -151,7 +150,7 @@ def __call__( ) else: tree = Tree( - path, + Path(path), max_depth=depth, exclude_node_func=build_excluded_node_func( include_hidden=include_hidden, include_files=include_files @@ -262,7 +261,7 @@ class _TreeNode: # space between the indentation symbol of one level and the next INDENTATION_SPACING = " " - def 
__init__(self, path: str, depth: int, is_last_child: bool): + def __init__(self, path: Path, depth: int, is_last_child: bool): self.path = path self.depth = depth # depth in the Tree self.is_last_child = is_last_child # if is last sibling in its subtree @@ -277,7 +276,7 @@ def __str__(self): if self.depth == 0: path = self.path else: - path = os.path.basename(self.path) + path = self.path.name if self.COLOR is not None: path = ansi_colors.color_word(path, self.COLOR) From bc0316bbbdbb5976ef141a4038fbeaf977a42250 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 22:31:55 +0200 Subject: [PATCH 041/131] refactor test suite using test classes --- datalad_next/tests/test_tree.py | 435 +++++++++++++++++--------------- 1 file changed, 237 insertions(+), 198 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index ea7d646c..57ae7582 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -14,10 +14,12 @@ from ..tree import Tree, DatasetTree, build_excluded_node_func """ -Tests for datalad tree. +Tests for the ``datalad tree`` command. """ +# ============================ Helper functions =============================== + def create_temp_dir_tree(tree_dict: dict) -> Path: """ Create a temporary directory tree. @@ -43,7 +45,7 @@ def create_temp_dir_tree(tree_dict: dict) -> Path: def path_no_ds(): """ Fixture for temporary directory tree including nested - directories, without datasets + directories, without datasets. """ dir_tree = { "root": { @@ -83,7 +85,7 @@ def path_no_ds(): def path_ds(): """ Fixture for temporary directory tree including nested - directories and datasets + directories and datasets. """ ds_tree = { "root": { @@ -132,18 +134,93 @@ def path_ds(): assert not temp_dir_root.exists() +@pytest.fixture(scope="class") +def inject_path_no_ds(request, path_no_ds): + """ + Set path_no_ds fixture (root path of temp directory tree) as + class attribute, to make it available to all tests in the class + """ + request.cls.path = path_no_ds + + +@pytest.fixture(scope="class") +def inject_path_ds(request, path_ds): + request.cls.path = path_ds + + def format_param_ids(val): - """Helper to format pytest parameter IDs. + """ + Helper to format pytest parameter IDs. If the parameter is a multiline string, we assume it is the parameter 'expected' (expected output of tree), and just - give it a fixed ID.""" + give it a fixed ID. Otherwise, it will be displayed in the + parameter list as a long unreadable string. + """ if isinstance(val, str) and "\n" in val: return "expected" -# combinations of parameters to be tested and their expected results. 
-# (2 levels per param) ** (3 params) = 8 combinations + 8 expected results -matrix_no_ds = [ +def build_param_matrix(matrix, params): + """Turn inner dicts into lists (required by pytest parametrize)""" + matrix_out = [] + for combination in matrix: + matrix_out.append( + # order of combinations does not matter + [val for key, val in combination.items() if key in params] + ) + return matrix_out + + +def pytest_generate_tests(metafunc): + # see: https://docs.pytest.org/en/7.1.x/example/parametrize.html#parametrizing-test-methods-through-per-class-configuration + if metafunc.cls: + test_id = metafunc.function.__name__ + test_params_dict = metafunc.cls.params + matrix = metafunc.cls.MATRIX + if test_id in metafunc.cls.params: + param_names = test_params_dict[test_id] + metafunc.parametrize( + param_names, + build_param_matrix(matrix, param_names), + ids=format_param_ids + ) + +# ================================= Tests ===================================== + +def test_print_tree_fails_for_nonexistent_directory(): + """Obtain nonexistent directory by creating a temp dir + and deleting it (may be safest method)""" + dir_name = f"to_be_deleted_{datetime.now().timestamp()}" + nonexistent_dir = with_tree({dir_name: []})(lambda f: f)() + with assert_raises(ValueError): + Tree(nonexistent_dir, max_depth=1) + + +class TestTree: + """Base class with tests that should run for all Tree configurations""" + __test__ = False # tells pytest to not collect tests in this class + path = None # will be set by the inject_* fixture to temp dir tree root + + # a map specifying multiple argument sets for a test method + params = { + "test_print_tree": [ + "depth", "include_files", "include_hidden", "expected_str" + ], + "test_print_stats": [ + "depth", "include_files", "include_hidden", "expected_stats_str" + ] + } + + +@pytest.mark.usefixtures("inject_path_no_ds") +class TestTreeWithoutDatasets(TestTree): + """Test directory tree without any datasets""" + + __test__ = True + + # matrix holds combinations of parameters to be tested + # and their expected results + MATRIX = [ { "depth": 1, "include_files": False, @@ -272,10 +349,83 @@ def format_param_ids(val): └── dir2_dir2/ """ }, -] + ] + + def test_print_tree( + self, depth, include_files, include_hidden, expected_str + ): + root = Path(self.path) / "root" + tree = Tree( + root, max_depth=depth, + exclude_node_func=build_excluded_node_func( + include_hidden=include_hidden, include_files=include_files + ), + skip_root=True # skip the first line with the root directory + ) + lines = tree.print_line() + actual_res = "\n".join(line for line in lines) + "\n" + expected_res = expected_str.lstrip("\n") # strip first newline + print("expected:") + print(expected_res) + print("actual:") + print(actual_res) + assert_str_equal(expected_res, actual_res) + + def test_print_stats( + self, depth, include_files, include_hidden, expected_stats_str + ): + root = self.path / 'root' + tree = Tree( + root, max_depth=depth, + exclude_node_func=build_excluded_node_func( + include_hidden=include_hidden, include_files=include_files + ), + ).build() + actual_res = tree.stats() + expected_res = expected_stats_str + assert_str_equal(expected_res, actual_res) + + @pytest.mark.parametrize( + "root_dir_name", ["root/", "root/.", "root/./", "root/../root"] + ) + def test_root_path_is_normalized(self, root_dir_name): + """ + Test that root path in the first line of string output + is normalized path + """ + root = self.path / root_dir_name + tree = Tree(root, max_depth=0) + expected = 
str(self.path / "root") + actual = next(tree.print_line()) # first line of tree output + assert_str_equal(expected, actual) + + def test_tree_to_string(self): + root = self.path / 'root' + tree = Tree(root, 3) + actual = tree.to_string() + expected = "\n".join(tree._lines) + assert_str_equal(expected, actual) + + def test_print_tree_depth_zero(self): + root = self.path / "root" + tree = Tree( + root, + max_depth=0, + # including files should have no effect + exclude_node_func=build_excluded_node_func(include_files=True) + ) + actual = tree.to_string() + expected = str(root) + assert_str_equal(expected, actual) -# for trees with datasets, we test the dataset-specific options -matrix_ds = [ + +@pytest.mark.usefixtures("inject_path_ds") +class TestTreeWithDatasets(TestTree): + """Test directory tree with datasets""" + + __test__ = True + + MATRIX = [ { "depth": 1, "expected_stats_str": "2 datasets, 1 directory, 0 files", @@ -301,9 +451,47 @@ def format_param_ids(val): └── sd1_subds0/ [DS~1, not installed] """, }, -] + ] + + params = { + "test_print_tree": [ + "depth", "expected_str" + ], + "test_print_stats": [ + "depth", "expected_stats_str" + ] + } + + def test_print_tree( + self, depth, expected_str + ): + root = self.path / "root" + tree = Tree( + root, max_depth=depth, + skip_root=True # skip the first line with the root directory + ) + lines = tree.print_line() + actual_res = "\n".join(l for l in lines) + "\n" + expected_res = expected_str.lstrip("\n") # strip first newline + assert_str_equal(expected_res, actual_res) + + def test_print_stats( + self, depth, expected_stats_str + ): + root = self.path / 'root' + tree = Tree(root, max_depth=depth).build() + actual_res = tree.stats() + expected_res = expected_stats_str + assert_str_equal(expected_res, actual_res) -matrix_max_ds_depth = [ + +@pytest.mark.usefixtures("inject_path_ds") +class TestDatasetTree(TestTree): + """Test dataset tree with max_dataset_depth parameter""" + + __test__ = True + + MATRIX = [ { "dataset_depth": 0, "depth": 0, @@ -354,189 +542,40 @@ def format_param_ids(val): └── sd1_subds0/ [DS~1, not installed] """ }, -] - - -def build_param_matrix(matrix, params): - """Turn inner dicts into lists (required by pytest parametrize)""" - matrix_out = [] - for combination in matrix: - matrix_out.append( - # order of combinations does not matter - [val for key, val in combination.items() if key in params] - ) - return matrix_out - - -# ================== Test directory tree without datasets ================== - -param_names = ["depth", "include_files", "include_hidden", "expected_str"] - - -@pytest.mark.parametrize( - param_names, build_param_matrix(matrix_no_ds, param_names), - ids=format_param_ids -) -def test_print_tree_with_params_no_ds( - path_no_ds, depth, include_files, include_hidden, expected_str -): - root = Path(path_no_ds) / "root" - tree = Tree( - root, max_depth=depth, - exclude_node_func=build_excluded_node_func( - include_hidden=include_hidden, include_files=include_files - ), - skip_root=True # skip the first line with the root directory - ) - lines = tree.print_line() - actual_res = "\n".join(l for l in lines) + "\n" - expected_res = expected_str.lstrip("\n") # strip first newline - print("expected:") - print(expected_res) - print("actual:") - print(actual_res) - assert_str_equal(expected_res, actual_res) - - -@pytest.mark.parametrize( - "root_dir_name", ["root/", "root/.", "root/./", "root/../root"] -) -def test_root_path_is_normalized(path_no_ds, root_dir_name): - """ - Test that root path in the first line 
of string output - is normalized path - """ - root = path_no_ds / root_dir_name - tree = Tree(root, max_depth=0) - expected = str(path_no_ds / "root") - actual = next(tree.print_line()) # first line of tree output - assert_str_equal(expected, actual) - - -def test_print_tree_fails_for_nonexistent_directory(): - """Obtain nonexistent directory by creating a temp dir - and deleting it (may be safest method)""" - dir_name = f"to_be_deleted_{datetime.now().timestamp()}" - nonexistent_dir = with_tree({dir_name: []})(lambda f: f)() - with assert_raises(ValueError): - Tree(nonexistent_dir, max_depth=1) - - -param_names = ["depth", "include_files", "include_hidden", "expected_stats_str"] - -@pytest.mark.parametrize( - param_names, build_param_matrix(matrix_no_ds, param_names) -) -def test_print_stats_no_ds( - path_no_ds, depth, include_files, include_hidden, expected_stats_str -): - root = path_no_ds / 'root' - tree = Tree( - root, max_depth=depth, - exclude_node_func=build_excluded_node_func( - include_hidden=include_hidden, include_files=include_files - ), - ).build() - actual_res = tree.stats() - expected_res = expected_stats_str - assert_str_equal(expected_res, actual_res) - - -def test_tree_to_string(path_no_ds): - root = path_no_ds / 'root' - tree = Tree(root, 3) - actual = tree.to_string() - expected = "\n".join(tree._lines) - assert_str_equal(expected, actual) - - -# ================== Test directory tree with datasets ================== - -param_names = ["depth", "expected_str"] - - -@pytest.mark.parametrize( - param_names, build_param_matrix(matrix_ds, param_names), - ids=format_param_ids -) -def test_print_tree_with_params_with_ds( - path_ds, depth, expected_str -): - root = path_ds / "root" - tree = Tree( - root, max_depth=depth, - skip_root=True # skip the first line with the root directory - ) - lines = tree.print_line() - actual_res = "\n".join(l for l in lines) + "\n" - expected_res = expected_str.lstrip("\n") # strip first newline - assert_str_equal(expected_res, actual_res) - - -param_names = ["depth", "expected_stats_str"] - - -@pytest.mark.parametrize( - param_names, build_param_matrix(matrix_ds, param_names) -) -def test_print_stats_with_ds( - path_ds, depth, expected_stats_str -): - root = path_ds / 'root' - tree = Tree(root, max_depth=depth).build() - actual_res = tree.stats() - expected_res = expected_stats_str - assert_str_equal(expected_res, actual_res) - - -def test_print_tree_depth_zero(path_no_ds): - root = path_no_ds / "root" - tree = Tree( - root, - max_depth=0, - # including files should have no effect - exclude_node_func=build_excluded_node_func(include_files=True) - ) - actual = tree.to_string() - expected = str(root) - assert_str_equal(expected, actual) - - -param_names = ["dataset_depth", "depth", "expected_str"] - + ] + + params = { + "test_print_tree": [ + "dataset_depth", "depth", "expected_str" + ], + "test_print_stats": [ + "dataset_depth", "depth", "expected_stats_str" + ] + } -@pytest.mark.parametrize( - param_names, build_param_matrix(matrix_max_ds_depth, param_names), - ids=format_param_ids -) -def test_print_tree_with_max_dataset_depth( - path_ds, dataset_depth, depth, expected_str -): - root = path_ds / "root" - tree = DatasetTree( - root, max_depth=depth, max_dataset_depth=dataset_depth, - skip_root=True) - lines = tree.print_line() - actual_res = "\n".join(l for l in lines) + "\n" - expected_res = expected_str.lstrip("\n") # strip first newline - print("expected:") - print(expected_res) - print("actual:") - print(actual_res) - 
assert_str_equal(expected_res, actual_res) - - -param_names = ["dataset_depth", "depth", "expected_stats_str"] - - -@pytest.mark.parametrize( - param_names, build_param_matrix(matrix_max_ds_depth, param_names) -) -def test_print_stats_with_max_dataset_depth( - path_ds, dataset_depth, depth, expected_stats_str -): - root = path_ds / 'root' - tree = DatasetTree(root, max_depth=depth, max_dataset_depth=dataset_depth).build() - actual_res = tree.stats() - expected_res = expected_stats_str - assert_str_equal(expected_res, actual_res) + def test_print_tree( + self, dataset_depth, depth, expected_str + ): + root = self.path / "root" + tree = DatasetTree( + root, max_depth=depth, max_dataset_depth=dataset_depth, + skip_root=True) + lines = tree.print_line() + actual_res = "\n".join(l for l in lines) + "\n" + expected_res = expected_str.lstrip("\n") # strip first newline + print("expected:") + print(expected_res) + print("actual:") + print(actual_res) + assert_str_equal(expected_res, actual_res) + + def test_print_stats( + self, dataset_depth, depth, expected_stats_str + ): + root = self.path / 'root' + tree = DatasetTree( + root, max_depth=depth, max_dataset_depth=dataset_depth + ).build() + actual_res = tree.stats() + expected_res = expected_stats_str + assert_str_equal(expected_res, actual_res) From 04717c3a5f154d1ce8d3106559b7956622afbc3e Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 23:37:52 +0200 Subject: [PATCH 042/131] cleanup for readability --- datalad_next/tree.py | 81 +++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 405c4733..be7c1485 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -343,7 +343,7 @@ def __init__(self, self.skip_root = skip_root # set custom or default filter criteria - self.exclude_node_func = exclude_node_func or self._default_exclude_func + self.exclude_node_func = exclude_node_func or self.default_exclude_func # store list of lines of output string self._lines = [] @@ -353,19 +353,14 @@ def __init__(self, for node_type in _TreeNode.__subclasses__()} @staticmethod - def _default_exclude_func(path: Path): - """By default, only include non-hidden directories""" + def default_exclude_func(path: Path): + """By default, only include non-hidden directories, no files""" return any((not path.is_dir(), path.name.startswith("."))) - def _get_depth(self, path: Path): - """Get directory depth of current path relative to root of the tree""" + def path_depth(self, path: Path): + """Get directory depth of a given path relative to root of the tree""" return len(path.relative_to(self.root).parts) - def _max_depth_reached(self, path): - if self.max_depth is None: - return False # unlimited depth - return self._get_depth(path) >= self.max_depth - def stats(self): """ Returns a string with counts of different node types, similar @@ -385,23 +380,19 @@ def stats(self): for node_type in node_types ) - def build(self): - """ - Construct the tree string representation and return back the instance. 
- """ - self.to_string() - return self - def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): """Recursively yield directory tree nodes starting from ``dir_path``""" if not self.skip_root or \ - self.skip_root and self._get_depth(dir_path) > 0: + self.skip_root and self.path_depth(dir_path) > 0: yield DirectoryOrDatasetNode( - dir_path, self._get_depth(dir_path), is_last_child + dir_path, self.path_depth(dir_path), is_last_child ) - if not self._max_depth_reached(dir_path): + # check that we are within max_depth levels + # (None means unlimited depth) + if self.max_depth is None or \ + self.path_depth(dir_path) < self.max_depth: # apply exclusion filter selected_children = ( @@ -421,7 +412,7 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): # recurse into subdirectories yield from self._generate_tree_nodes(child, is_last_child) else: - yield FileNode(child, self._get_depth(child), is_last_child) + yield FileNode(child, self.path_depth(child), is_last_child) @increment_node_count def generate_nodes(self): @@ -438,6 +429,13 @@ def generate_nodes(self): for node in self._generate_tree_nodes(self.root): yield node + def build(self): + """ + Construct the tree string representation and return back the instance. + """ + self.to_string() + return self + def to_string(self): """Return complete tree as string""" if not self._lines: @@ -517,23 +515,24 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): @increment_node_count def generate_nodes(self): + """ + A ``DatasetTree`` is just an unlimited-depth ``Tree`` with more + complex rules for excluding nodes. Each exclusion rule is encoded + in a function. The rules are then combined in a final + ``exclusion_func`` which is supplied to the ``Tree`` constructor. + """ def is_git_folder(path: Path): - """Do not traverse the .git folder (we will not find - datasets underneath it)""" return path.is_dir() and path.name == ".git" def ds_exceeds_max_ds_depth(path: Path): - """Exclude datasets with ds_depth > max_ds_depth""" if path.is_dir() and is_dataset(path): - ds = DatasetNode(path, self._get_depth(path), False) + ds = DatasetNode(path, self.path_depth(path), False) return ds.ds_depth > self.max_dataset_depth return False - def ds_child_exceeds_max_depth(path: Path): - """Exclude files or directories underneath a dataset, - if they have depth (relative to dataset root) > max_depth""" - if not path.is_dir() or not is_dataset(path): + def ds_child_node_exceeds_max_depth(path: Path): + if not (path.is_dir() and is_dataset(path)): ds_parents = [ p for p in path.parents if p.is_relative_to(self.root) and @@ -541,14 +540,11 @@ def ds_child_exceeds_max_depth(path: Path): ] if ds_parents: parent = ds_parents[-1] # closest parent - relative_depth = self._get_depth(path) - self._get_depth(parent) + relative_depth = self.path_depth(path) - self.path_depth(parent) return relative_depth > self.max_depth return True - def is_ds_parent_with_depth(path: Path): - """Exclude directory if it is a parent of a dataset with - ds_depth > max_ds_depth""" - + def is_parent_of_included_ds(path: Path): def exclude(p: Path): return not p.is_dir() or is_git_folder(p) @@ -562,10 +558,11 @@ def exclude(p: Path): ) def child_datasets(): + """Generator of dataset nodes""" for node in subtree.generate_nodes(): if isinstance(node, DatasetNode): # offset depth by depth of current path - node.depth += self._get_depth(path) + node.depth += self.path_depth(path) # need to recalculate dataset depth after # updating directory depth node.ds_depth, _ = 
node.calculate_dataset_depth() @@ -579,17 +576,23 @@ def child_datasets(): return False def exclude_func(path: Path): + """Combine exclusion criteria from different functions""" criteria = self.exclude_node_func(path) if path.is_dir() and is_dataset(path): + # check if maximum dataset depth is exceeded criteria |= ds_exceeds_max_ds_depth(path) else: + # do not traverse the .git folder (we will not find + # datasets underneath it) criteria |= is_git_folder(path) - if not path.is_dir() or \ - not is_ds_parent_with_depth(path): - pass - criteria |= ds_child_exceeds_max_depth(path) + # exclude files or directories underneath a dataset, + # if they have depth (relative to dataset root) > max_depth, + # unless they are themselves parents of a dataset with + # dataset depth within the valid ds_depth range + if not (path.is_dir() and is_parent_of_included_ds(path)): + criteria |= ds_child_node_exceeds_max_depth(path) return criteria From c7fba5c6df229baf51330238ff1fee38c4d63083 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 26 Jul 2022 23:48:34 +0200 Subject: [PATCH 043/131] move placement of class declarations --- datalad_next/tree.py | 166 +++++++++++++++++++++---------------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index be7c1485..df8b51dc 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -246,77 +246,6 @@ def is_empty_dir(): return False -class _TreeNode: - """ - Base class for a directory or file represented as a single - tree node and printed as single line of the 'tree' output. - """ - COLOR = None # ANSI color for the path, if terminal color are enabled - # symbols for the tip of the 'tree branch', depending on - # whether a node is the last in it subtree or not - PREFIX_MIDDLE_CHILD = "├── " - PREFIX_LAST_CHILD = "└── " - # symbol for representing the continuation of a 'tree branch' - INDENTATION_SYMBOL = "│" - # space between the indentation symbol of one level and the next - INDENTATION_SPACING = " " - - def __init__(self, path: Path, depth: int, is_last_child: bool): - self.path = path - self.depth = depth # depth in the Tree - self.is_last_child = is_last_child # if is last sibling in its subtree - - def __eq__(self, other): - return self.path == other.path - - def __hash__(self): - return hash(str(self.path)) - - def __str__(self): - if self.depth == 0: - path = self.path - else: - path = self.path.name - - if self.COLOR is not None: - path = ansi_colors.color_word(path, self.COLOR) - - prefix = "" - if self.depth > 0: - prefix = self.PREFIX_LAST_CHILD if self.is_last_child \ - else self.PREFIX_MIDDLE_CHILD - - return prefix + str(path) - - @staticmethod - def stats_description(count): - """String describing the node count that will be - included in the tree's report line""" - raise NotImplementedError - # return str(count) + (" node" if int(count) == 1 else " nodes") - - @property - def tree_root(self): - """Calculate tree root path from node path and depth""" - parents = self.parents - return parents[0] if parents \ - else self.path # we are the root - - @property - def parents(self): - """ - List of parent paths (beginning from the tree root) - in top-down order. - """ - parents_from_tree_root = [] - for depth, path in enumerate(Path(self.path).parents): - if depth >= self.depth: - break - parents_from_tree_root.append(path) - - return parents_from_tree_root[::-1] # top-down order - - class Tree: """ Main class for building and serializing a directory tree. 
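This reshuffling of class declarations is safe in Python because module-level names are resolved when a function is *called*, not when it is defined: methods of ``Tree`` may freely reference ``DirectoryOrDatasetNode`` and the ``_TreeNode`` subclasses even though those classes now appear further down in the module. A minimal standalone sketch of this late-binding behavior (the names ``Consumer`` and ``Helper`` are illustrative, not part of the patch):

    # late-binding sketch; illustrative names only, not datalad code
    class Consumer:
        def make(self):
            # 'Helper' is looked up in the module namespace at call
            # time, so its definition may come later in the file
            return Helper()

    class Helper:
        pass

    print(type(Consumer().make()).__name__)  # prints: Helper
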
@@ -606,6 +535,77 @@ def exclude_func(path: Path): yield from ds_tree.generate_nodes() +class _TreeNode: + """ + Base class for a directory or file represented as a single + tree node and printed as single line of the 'tree' output. + """ + COLOR = None # ANSI color for the path, if terminal color are enabled + # symbols for the tip of the 'tree branch', depending on + # whether a node is the last in it subtree or not + PREFIX_MIDDLE_CHILD = "├── " + PREFIX_LAST_CHILD = "└── " + # symbol for representing the continuation of a 'tree branch' + INDENTATION_SYMBOL = "│" + # space between the indentation symbol of one level and the next + INDENTATION_SPACING = " " + + def __init__(self, path: Path, depth: int, is_last_child: bool): + self.path = path + self.depth = depth # depth in the Tree + self.is_last_child = is_last_child # if is last sibling in its subtree + + def __eq__(self, other): + return self.path == other.path + + def __hash__(self): + return hash(str(self.path)) + + def __str__(self): + if self.depth == 0: + path = self.path + else: + path = self.path.name + + if self.COLOR is not None: + path = ansi_colors.color_word(path, self.COLOR) + + prefix = "" + if self.depth > 0: + prefix = self.PREFIX_LAST_CHILD if self.is_last_child \ + else self.PREFIX_MIDDLE_CHILD + + return prefix + str(path) + + @staticmethod + def stats_description(count): + """String describing the node count that will be + included in the tree's report line""" + raise NotImplementedError + # return str(count) + (" node" if int(count) == 1 else " nodes") + + @property + def tree_root(self): + """Calculate tree root path from node path and depth""" + parents = self.parents + return parents[0] if parents \ + else self.path # we are the root + + @property + def parents(self): + """ + List of parent paths (beginning from the tree root) + in top-down order. + """ + parents_from_tree_root = [] + for depth, path in enumerate(Path(self.path).parents): + if depth >= self.depth: + break + parents_from_tree_root.append(path) + + return parents_from_tree_root[::-1] # top-down order + + class DirectoryNode(_TreeNode): COLOR = ansi_colors.BLUE @@ -632,18 +632,6 @@ def stats_description(count): return str(count) + (" file" if int(count) == 1 else " files") -class DirectoryOrDatasetNode: - """ - Factory class for creating either a ``DirectoryNode`` or ``DatasetNode``, - based on whether the current path is a dataset or not. - """ - def __new__(cls, path, *args, **kwargs): - if is_dataset(path): - return DatasetNode(path, *args, **kwargs) - else: - return DirectoryNode(path, *args, **kwargs) - - class DatasetNode(_TreeNode): COLOR = ansi_colors.MAGENTA @@ -696,3 +684,15 @@ def calculate_dataset_depth(self): ds = superds return ds_depth, ds_absolute_depth + + +class DirectoryOrDatasetNode: + """ + Factory class for creating either a ``DirectoryNode`` or ``DatasetNode``, + based on whether the current path is a dataset or not. 
+ """ + def __new__(cls, path, *args, **kwargs): + if is_dataset(path): + return DatasetNode(path, *args, **kwargs) + else: + return DirectoryNode(path, *args, **kwargs) From 31856d283690c5c5d612d40cac5e36e5a2715442 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 27 Jul 2022 01:02:03 +0200 Subject: [PATCH 044/131] clean up remaining vestiges of non-Path paths --- datalad_next/tree.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index df8b51dc..534860a1 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -220,12 +220,13 @@ def is_dataset(path): Only consider datalad datasets, not plain git/git-annex repos. """ ds = require_dataset(path, check_installed=False) + ds_path = Path(ds.path) # detect if it is an installed datalad-proper dataset # (as opposed to git/git-annex repo). # could also query `ds.id`, but checking just for existence # of config file is quicker. - if Path(Path(ds.path) / ".datalad" / "config").is_file(): + if Path(ds_path / ".datalad" / "config").is_file(): return True # if it is not installed, check if it has an installed superdataset. @@ -233,7 +234,7 @@ def is_dataset(path): # directory has the .git folder), we check if the directory # is empty (faster) -- as e.g. after a non-recursive `datalad clone` def is_empty_dir(): - return not any(Path(ds.path).iterdir()) + return not any(ds_path.iterdir()) if is_empty_dir(): superds = ds.get_superdataset(datalad_only=True, topmost=False, @@ -259,8 +260,7 @@ def __init__(self, skip_root=False, exclude_node_func=None): - # TODO: root should already be given as Path object - self.root = Path(root).resolve() + self.root = root.resolve() if not self.root.is_dir(): raise ValueError(f"directory '{root}' not found") @@ -325,7 +325,7 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): # apply exclusion filter selected_children = ( - p for p in Path(dir_path).iterdir() + p for p in dir_path.iterdir() if not self.exclude_node_func(p) ) # sort directory contents alphabetically @@ -598,7 +598,7 @@ def parents(self): in top-down order. """ parents_from_tree_root = [] - for depth, path in enumerate(Path(self.path).parents): + for depth, path in enumerate(self.path.parents): if depth >= self.depth: break parents_from_tree_root.append(path) From daf19572b97bb1541840338352acbcbc857a3725 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 27 Jul 2022 01:03:07 +0200 Subject: [PATCH 045/131] cache results of ds.get_superdatasets() which brings a modest speed improvement --- datalad_next/tree.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 534860a1..3ac5ce2a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -237,9 +237,7 @@ def is_empty_dir(): return not any(ds_path.iterdir()) if is_empty_dir(): - superds = ds.get_superdataset(datalad_only=True, topmost=False, - registered_only=True) - if superds is not None: + if get_superdataset(ds) is not None: return True # TODO: is there a way to detect a datalad dataset if it @@ -247,6 +245,14 @@ def is_empty_dir(): return False +# cache this since we will re-run it +# on same dataset multiple times +@lru_cache +def get_superdataset(ds): + return ds.get_superdataset( + datalad_only=True, topmost=False, registered_only=True) + + class Tree: """ Main class for building and serializing a directory tree. 
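The caching added here uses ``functools.lru_cache``, which keys results on the function's arguments: this assumes ``Dataset`` objects are hashable, and it means the cache holds a strong reference to every dataset it has seen for the lifetime of the process (acceptable for a short-lived command). A self-contained sketch of the same memoization pattern, with the slow superdataset query simulated (illustrative names only, not datalad code):

    # memoization sketch; not datalad code
    from functools import lru_cache
    import time

    @lru_cache  # bare decorator form needs Python >= 3.8
    def expensive_lookup(key):
        time.sleep(0.1)  # stand-in for the slow superdataset query
        return f"result-for-{key}"

    expensive_lookup("ds1")   # cache miss: runs the slow body
    expensive_lookup("ds1")   # cache hit: returns instantly
    print(expensive_lookup.cache_info())
    # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
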
@@ -664,8 +670,7 @@ def calculate_dataset_depth(self): ds = self.ds while ds: - superds = ds.get_superdataset( - datalad_only=True, topmost=False, registered_only=True) + superds = get_superdataset(ds) if superds is None: # it is not a dataset, do nothing From 9a938f0168f05444a357a7b43731f46b68bde40b Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 27 Jul 2022 07:42:37 +0200 Subject: [PATCH 046/131] use Path obj in test --- datalad_next/tests/test_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 57ae7582..f11339ee 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -191,7 +191,7 @@ def test_print_tree_fails_for_nonexistent_directory(): """Obtain nonexistent directory by creating a temp dir and deleting it (may be safest method)""" dir_name = f"to_be_deleted_{datetime.now().timestamp()}" - nonexistent_dir = with_tree({dir_name: []})(lambda f: f)() + nonexistent_dir = Path(with_tree({dir_name: []})(lambda f: f)()) with assert_raises(ValueError): Tree(nonexistent_dir, max_depth=1) From 0ccac9bd4d2e8f04853c9961d71b58cc35403466 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 28 Jul 2022 00:03:32 +0200 Subject: [PATCH 047/131] improve docstrings, convert to numpy format --- datalad_next/tests/test_tree.py | 72 +++++-- datalad_next/tree.py | 357 ++++++++++++++++++++------------ 2 files changed, 271 insertions(+), 158 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index f11339ee..cdabc150 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -13,9 +13,7 @@ from ..tree import Tree, DatasetTree, build_excluded_node_func -""" -Tests for the ``datalad tree`` command. -""" +"""Tests for the ``datalad tree`` command.""" # ============================ Helper functions =============================== @@ -23,9 +21,19 @@ def create_temp_dir_tree(tree_dict: dict) -> Path: """ Create a temporary directory tree. - This is a shim for the 'with_tree' decorator so it can be used + + This is a shim for the ``with_tree()`` decorator so it can be used in a module-scoped pytest fixture. - Returns the Path object of the root directory. + + Parameters + ---------- + tree_dict: dict + A dict describing a directory tree (see parameter of ``with_tree``) + + Returns + ------- + Path + Root directory of the newly created tree """ # function to be decorated by 'with_tree' # just return the argument (will be the created temp path) @@ -43,9 +51,13 @@ def create_temp_dir_tree(tree_dict: dict) -> Path: @pytest.fixture(scope="module") def path_no_ds(): - """ - Fixture for temporary directory tree including nested - directories, without datasets. + """Fixture for creating a temporary directory tree (**without** datasets) + to be used in tests. + + Returns + ------- + Path + Root directory of the newly created tree """ dir_tree = { "root": { @@ -83,9 +95,13 @@ def path_no_ds(): @pytest.fixture(scope="module") def path_ds(): - """ - Fixture for temporary directory tree including nested - directories and datasets. + """Fixture for creating a temporary directory tree (**including** datasets) + to be used in tests. 
+ + Returns + ------- + Path + Root directory of the newly created tree """ ds_tree = { "root": { @@ -137,24 +153,34 @@ def path_ds(): @pytest.fixture(scope="class") def inject_path_no_ds(request, path_no_ds): """ - Set path_no_ds fixture (root path of temp directory tree) as - class attribute, to make it available to all tests in the class + Set ``path_no_ds`` fixture (root path of temp directory tree) as class + attribute, to make it available to all tests in the class """ request.cls.path = path_no_ds @pytest.fixture(scope="class") def inject_path_ds(request, path_ds): + """ + Set ``path_ds`` fixture (root path of temp directory tree) as class + attribute, to make it available to all tests in the class + """ request.cls.path = path_ds -def format_param_ids(val): +def format_param_ids(val) -> str: """ Helper to format pytest parameter IDs. + If the parameter is a multiline string, we assume it is the parameter 'expected' (expected output of tree), and just - give it a fixed ID. Otherwise, it will be displayed in the - parameter list as a long unreadable string. + give it a fixed ID (otherwise, it would be displayed in the + parameter list as a long unreadable string). + + Parameters + ---------- + val + Parameter value """ if isinstance(val, str) and "\n" in val: return "expected" @@ -172,7 +198,13 @@ def build_param_matrix(matrix, params): def pytest_generate_tests(metafunc): - # see: https://docs.pytest.org/en/7.1.x/example/parametrize.html#parametrizing-test-methods-through-per-class-configuration + """Pytest helper to automatically configure parametrization. + + Avoids having to duplicate definition of parameter names and values + across tests that use the same data. + + See: https://docs.pytest.org/en/7.1.x/example/parametrize.html#parametrizing-test-methods-through-per-class-configuration + """ if metafunc.cls: test_id = metafunc.function.__name__ test_params_dict = metafunc.cls.params @@ -188,8 +220,8 @@ def pytest_generate_tests(metafunc): # ================================= Tests ===================================== def test_print_tree_fails_for_nonexistent_directory(): - """Obtain nonexistent directory by creating a temp dir - and deleting it (may be safest method)""" + """Obtain nonexistent directory by creating a temp dir and deleting it + (may be safest method)""" dir_name = f"to_be_deleted_{datetime.now().timestamp()}" nonexistent_dir = Path(with_tree({dir_name: []})(lambda f: f)()) with assert_raises(ValueError): @@ -201,7 +233,7 @@ class TestTree: __test__ = False # tells pytest to not collect tests in this class path = None # will be set by the inject_* fixture to temp dir tree root - # a map specifying multiple argument sets for a test method + # dict specifying multiple argument sets for a test method params = { "test_print_tree": [ "depth", "include_files", "include_hidden", "expected_str" diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 3ac5ce2a..eefa7f63 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -6,41 +6,9 @@ # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## -""" -'tree'-like command for visualization of dataset hierarchies. - -This command covers 2 main use cases: - -(1) Glorified `tree` command: - ---- - As a datalad user, I want to list the contents of a directory tree and - see which directories are datalad datasets, so that I can locate my - datasets in the context of the whole directory layout. 
- ---- - This is basically just `tree` with visual markers for datasets. - In addition to it, `datalad-tree` provides the following: - 1. The subdataset hierarchy level information - (included in the dataset marker, e.g. [DS~0]). - This is the absolute level, meaning it may take into account - superdatasets that are located above the tree root and thus are - not included in the display. - 2. The option to list only directories that are datasets. - 3. The count of displayed datasets in the "report line" - (where `tree` only reports count of directories and files). - -(2) Descriptor of nested subdataset hierarchies: - --- - As a datalad user, I want to visualize the structure of multiple datasets - and their hierarchies at once based on the subdataset nesting level, - regardless of their depth in the directory tree. This helps me - understand and communicate the layout of my datasets. - --- - This is the more datalad-specific case. Here we redefine 'depth' as the - level in the subdataset hierarchy instead of the filesystem hierarchy. - -""" - -__docformat__ = 'restructuredtext' +"""'tree'-like command for visualizing dataset hierarchies""" + +__docformat__ = "numpy" import logging from functools import wraps, lru_cache @@ -74,10 +42,40 @@ class TreeCommand(Interface): """Visualize directory and dataset hierarchies - This command mimics the UNIX/MSDOS 'tree' command to display a - directory tree, highlighting DataLad datasets in the hierarchy. + This command mimics the UNIX/MSDOS ``tree`` utility to display a directory + tree, highlighting DataLad datasets in the hierarchy. + + Two main use cases are covered: + + 1. Glorified ``tree`` command: + + List the contents of a directory tree and see which directories are + datalad datasets (including subdatasets that are present but not + installed, such as after a non-recursive clone). + + This is basically just ``tree`` with visual markers for datasets. In + addition to it, ``datalad-tree`` provides the following: + + - The subdataset hierarchy level is shown in the dataset marker + (e.g. [DS~2]). This is the absolute level, meaning it may also take + into account superdatasets located above the tree root and thus + not included in the output. + - The 'report line' at the bottom of the output shows the count of + displayed datasets, in addition to the count of directories and + files. + + 2. Descriptor of nested subdataset hierarchies: + Visualize the structure of multiple datasets and their hierarchies at + once based on the subdataset nesting level, regardless of their location + in the directory tree. + + In this case, the tree depth is determined by the subdataset depth. + There is also the option to display contents (directories/files) of + each dataset up to max_depth levels, to provide better context around + the datasets. 
""" + result_renderer = 'tailored' _params_ = dict( @@ -104,21 +102,19 @@ class TreeCommand(Interface): doc="""include files in output display""", action='store_true'), include_hidden=Parameter( - args=("-a", "--include-hidden",), - doc="""include hidden files/directories in output""", + args=("--include-hidden",), + doc="""include hidden files/directories in output display""", action='store_true'), ) _examples_ = [ - dict( - text="Display up to 3 levels of subdirectories and their contents " - "including files, starting from the current directory", - code_py="tree(depth=3, include_files=True)", - code_cmd="datalad tree -L 3 --include-files"), - dict(text="List all first- and second-level subdatasets " - "of parent datasets located anywhere under /tmp, " - "regardless of directory depth, " - "including in hidden directories", + dict(text="Display up to 3 levels of the current directory's " + "subdirectories and their contents", + code_py="tree(depth=3, include_files=True)", + code_cmd="datalad tree -L 3 --include-files"), + dict(text="Display all first- and second-level subdatasets of " + "datasets located anywhere under /tmp (including in hidden " + "directories) regardless of directory depth", code_py="tree('/tmp', dataset_depth=2, include_hidden=True)", code_cmd="datalad tree /tmp -R 2 --include-hidden"), dict(text="Display first- and second-level subdatasets and their " @@ -136,8 +132,7 @@ def __call__( depth=None, dataset_depth=None, include_files=False, - include_hidden=False, - ): + include_hidden=False): if dataset_depth is not None: tree = DatasetTree( @@ -145,16 +140,14 @@ def __call__( max_depth=depth, max_dataset_depth=dataset_depth, exclude_node_func=build_excluded_node_func( - include_hidden=include_hidden, include_files=include_files - ) + include_hidden=include_hidden, include_files=include_files) ) else: tree = Tree( Path(path), max_depth=depth, exclude_node_func=build_excluded_node_func( - include_hidden=include_hidden, include_files=include_files - ) + include_hidden=include_hidden, include_files=include_files) ) for line in tree.print_line(): @@ -171,10 +164,14 @@ def __call__( def build_excluded_node_func(include_hidden=False, include_files=False): - """ - Returns a callable to filter tree nodes. - The callable takes the Path object of a tree node as input, - and returns true if the node should not be displayed in the tree. + """Return a function to exclude ``_TreeNode`` objects from the tree ( + prevents them from being yielded by the node generator). + + Returns + ------- + Callable + Function that takes the Path object of a ``_TreeNode`` as input, + and returns true if the node should *not* be displayed in the tree. """ def is_excluded(path): @@ -187,9 +184,13 @@ def is_excluded(path): def increment_node_count(node_generator_func): - """ - Decorator for incrementing the node count whenever - a ``_TreeNode`` is generated. + """Decorator for incrementing the node count whenever a ``_TreeNode`` is + yielded. + + Parameters + ---------- + node_generator_func: Callable + Function that yields ``_TreeNode`` objects """ @wraps(node_generator_func) def _wrapper(*args, **kwargs): @@ -208,16 +209,26 @@ def _wrapper(*args, **kwargs): return _wrapper -# whether path is a dataset should not change within -# command execution time, so we cache it @lru_cache -def is_dataset(path): - """ - Fast dataset detection. +def is_dataset(path: Path): + """Fast dataset detection. 
Infer that a directory is a dataset if it is either: - (A) installed, or - (B) not installed, but has an installed superdatset. + + - installed, or + - not installed, but has an installed superdatset. + Only consider datalad datasets, not plain git/git-annex repos. + + Results are cached because the check is somewhat expensive and may be run + multiple times on the same path. + + TODO: is there a way to detect a datalad dataset if it is not installed + and it is not a subdataset? + + Parameters + ---------- + path: Path + Path to directory to be identified as dataset or non-dataset """ ds = require_dataset(path, check_installed=False) ds_path = Path(ds.path) @@ -240,32 +251,48 @@ def is_empty_dir(): if get_superdataset(ds) is not None: return True - # TODO: is there a way to detect a datalad dataset if it - # is not installed and it is not a subdataset? For now, we don't return False -# cache this since we will re-run it -# on same dataset multiple times @lru_cache def get_superdataset(ds): + """Wrapper for ``Dataset.get_superdataset()`` with predefined options. + + Results are cached, as this function may be rerun on the same dataset + multiple times. + + Parameters + ---------- + ds: Dataset + + Returns + ------- + Dataset or None + """ return ds.get_superdataset( datalad_only=True, topmost=False, registered_only=True) class Tree: - """ - Main class for building and serializing a directory tree. - Does not store ``_TreeNode`` objects, only the string representation - of the whole tree and the statistics (counts of different node types). - """ + """Main class for generating and serializing a directory tree""" def __init__(self, root: Path, max_depth=None, skip_root=False, exclude_node_func=None): - + """ + Parameters + ---------- + root: Path + Directory to be used as tree root + max_depth: int or None + Maximum directory depth for traversing the tree + skip_root: bool + If true, will not print the first line with tree root + exclude_node_func: Callable or None + Function to filter out tree nodes from the tree + """ self.root = root.resolve() if not self.root.is_dir(): raise ValueError(f"directory '{root}' not found") @@ -274,7 +301,6 @@ def __init__(self, if max_depth is not None and max_depth < 0: raise ValueError("max_depth must be >= 0") - # do not print first line with root path self.skip_root = skip_root # set custom or default filter criteria @@ -283,7 +309,7 @@ def __init__(self, # store list of lines of output string self._lines = [] - # dict with count of nodes for each _TreeNode subtype + # store dict with count of nodes for each _TreeNode subtype self._stats = {node_type.__name__: 0 for node_type in _TreeNode.__subclasses__()} @@ -292,17 +318,20 @@ def default_exclude_func(path: Path): """By default, only include non-hidden directories, no files""" return any((not path.is_dir(), path.name.startswith("."))) - def path_depth(self, path: Path): - """Get directory depth of a given path relative to root of the tree""" + def path_depth(self, path: Path) -> int: + """Calculate directory depth of a given path relative to the root of + the tree""" return len(path.relative_to(self.root).parts) - def stats(self): + def stats(self) -> str: """ - Returns a string with counts of different node types, similar + Produces a string with counts of different node types, similar to the tree command's 'report line' at the end of the tree output. - The 3 node types (directory, dataset, file) are mutually exclusive, - so their sum equals to the total node count. 
+ + The node types (subclasses of ``_TreeNode``) are mutually exclusive, + so the sum of their counts equals to the total node count. + Does not count the root itself, only the contents below the root. """ # sort node type names alphabetically @@ -316,7 +345,17 @@ def stats(self): ) def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): - """Recursively yield directory tree nodes starting from ``dir_path``""" + """Recursively yield ``_TreeNode`` objects starting from + ``dir_path`` + + Parameters + ---------- + dir_path: Path + Directory from which to calculate the tree + is_last_child: bool + Whether the directory ``dir_path`` is the last child of its + parent in the ordered list of child nodes + """ if not self.skip_root or \ self.skip_root and self.path_depth(dir_path) > 0: @@ -324,8 +363,8 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): dir_path, self.path_depth(dir_path), is_last_child ) - # check that we are within max_depth levels - # (None means unlimited depth) + # check that we are within max_depth levels (None means unlimited + # depth) if self.max_depth is None or \ self.path_depth(dir_path) < self.max_depth: @@ -339,8 +378,8 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): for child in children: - # check if node is the last child within its subtree - # (needed for displaying special end-of-subtree prefix) + # check if node is the last child within its subtree (needed + # for displaying special end-of-subtree prefix) is_last_child = (child == children[-1]) if child.is_dir(): @@ -355,30 +394,37 @@ def generate_nodes(self): Traverse a directory tree starting from the root path. Yields ``_TreeNode`` objects, each representing a directory or dataset or file. Nodes are traversed in depth-first order. + + Returns + ------- + Generator[_TreeNode] """ - # because the underlying generator is recursive, we cannot - # directly decorate it with `increment_node_count` (since - # it would count twice whenever the function recurses). - # so we decorate a separate function where we just yield - # from the underlying decorator. - for node in self._generate_tree_nodes(self.root): - yield node + # because the node generator is recursive, we cannot directly + # decorate it with `increment_node_count` (since it would count + # twice whenever the function recurses). + # so we decorate a separate function where we just yield from the + # underlying generator. + yield from self._generate_tree_nodes(self.root) def build(self): - """ - Construct the tree string representation and return back the instance. - """ + """Construct the tree string representation (will be stored in + instance attribute) and return the instance.""" self.to_string() return self - def to_string(self): + def to_string(self) -> str: """Return complete tree as string""" if not self._lines: return "\n".join(list(self.print_line())) return "\n".join(self._lines) def print_line(self): - """Generator for tree output lines""" + """Generator for tree string output lines + + Returns + ------- + Generator[str] + """ if not self._lines: # string output has not been generated yet for line in self._yield_lines(): @@ -391,20 +437,26 @@ def print_line(self): yield "\n" # newline at the very end def _yield_lines(self): - """ - Generator of lines of the tree string representation. - Each line represents a node (directory or dataset or file). + """Generator of lines of the tree string representation. + + Each line represents a tree node (directory or dataset or file). 
+ A line follows the structure: ``[] [] `` + Example line: - ``| | ├── path_dir_level3`` + ``│ │ ├── path_dir_level3`` + + Returns + ------- + Generator[str] """ - # keep track of levels where subtree is exhaused, i.e. - # we have reached the last child of the subtree. - # this is needed to build the indentation string for each - # node, which takes into account whether any parent - # is the last node of its own subtree. + # keep track of levels where subtree is exhaused, i.e. we have + # reached the last child of the subtree. + # this is needed to build the indentation string for each node, + # which takes into account whether any parent is the last node of + # its own subtree. levels_with_exhausted_subtree = set([]) for node in self.generate_nodes(): @@ -412,8 +464,8 @@ def _yield_lines(self): if node.is_last_child: # last child of its subtree levels_with_exhausted_subtree.add(node.depth) else: - # 'discard' does not raise exception - # if value does not exist in set + # 'discard' does not raise exception if value does not exist + # in set levels_with_exhausted_subtree.discard(node.depth) # build indentation string @@ -434,15 +486,16 @@ def _yield_lines(self): class DatasetTree(Tree): """ - ``DatasetTree`` is a ``Tree`` where hierarchy depth refers to the + ``DatasetTree`` is a ``Tree`` whose depth is determined by the subdataset hierarchy level, instead of directory depth. + Because of the different semantics of the ``max_depth`` parameter, we implement a separate subclass of ``Tree``. """ def __init__(self, *args, max_dataset_depth=0, **kwargs): super().__init__(*args, **kwargs) - # by default, do not recurse into datasets' subdirectories - # (other than paths to nested subdatasets) + # by default, do not recurse into datasets' subdirectories (other + # than paths to nested subdatasets) if self.max_depth is None: self.max_depth = 0 @@ -451,10 +504,16 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): @increment_node_count def generate_nodes(self): """ + Yield ``_TreeNode`` objects that belong to the tree. + A ``DatasetTree`` is just an unlimited-depth ``Tree`` with more complex rules for excluding nodes. Each exclusion rule is encoded in a function. The rules are then combined in a final ``exclusion_func`` which is supplied to the ``Tree`` constructor. + + Returns + ------- + Generator[_TreeNode] """ def is_git_folder(path: Path): @@ -493,7 +552,7 @@ def exclude(p: Path): ) def child_datasets(): - """Generator of dataset nodes""" + """Generator of dataset nodes below the current node""" for node in subtree.generate_nodes(): if isinstance(node, DatasetNode): # offset depth by depth of current path @@ -542,24 +601,35 @@ def exclude_func(path: Path): class _TreeNode: - """ - Base class for a directory or file represented as a single - tree node and printed as single line of the 'tree' output. + """Base class for a directory or file represented as a single tree node + and printed as single line of the 'tree' output. 
""" COLOR = None # ANSI color for the path, if terminal color are enabled + # symbols for the tip of the 'tree branch', depending on # whether a node is the last in it subtree or not PREFIX_MIDDLE_CHILD = "├── " PREFIX_LAST_CHILD = "└── " + # symbol for representing the continuation of a 'tree branch' INDENTATION_SYMBOL = "│" - # space between the indentation symbol of one level and the next + # spacing between the indentation symbol of one level and the next INDENTATION_SPACING = " " def __init__(self, path: Path, depth: int, is_last_child: bool): + """ + Parameters + ---------- + path: Path + Path of the tree node + depth: int + Directory depth of the node within its tree + is_last_child: bool + Whether the node is the last node among its parent's children + """ self.path = path - self.depth = depth # depth in the Tree - self.is_last_child = is_last_child # if is last sibling in its subtree + self.depth = depth + self.is_last_child = is_last_child def __eq__(self, other): return self.path == other.path @@ -568,6 +638,7 @@ def __hash__(self): return hash(str(self.path)) def __str__(self): + # display root directory with full path, all other nodes with basename if self.depth == 0: path = self.path else: @@ -585,13 +656,13 @@ def __str__(self): @staticmethod def stats_description(count): - """String describing the node count that will be - included in the tree's report line""" + """String describing the node count that will be included in the + tree's report line""" + # should be implemented by subclasses raise NotImplementedError - # return str(count) + (" node" if int(count) == 1 else " nodes") @property - def tree_root(self): + def tree_root(self) -> Path: """Calculate tree root path from node path and depth""" parents = self.parents return parents[0] if parents \ @@ -599,9 +670,12 @@ def tree_root(self): @property def parents(self): - """ - List of parent paths (beginning from the tree root) - in top-down order. + """List of parent paths (beginning from the tree root) in top-down + order. + + Returns + ------- + List[Path] """ parents_from_tree_root = [] for depth, path in enumerate(self.path.parents): @@ -651,8 +725,8 @@ def __init__(self, *args, **kwargs): def __str__(self): install_flag = ", not installed" if not self.is_installed else "" dir_suffix = "/" if self.depth > 0 else "" - ds_suffix = f" [DS~{self.ds_absolute_depth}{install_flag}]" - return super().__str__() + dir_suffix + ds_suffix + ds_marker = f" [DS~{self.ds_absolute_depth}{install_flag}]" + return super().__str__() + dir_suffix + ds_marker @staticmethod def stats_description(count): @@ -661,8 +735,16 @@ def stats_description(count): def calculate_dataset_depth(self): """ Calculate 2 measures of a dataset's nesting depth/level: - 1. subdataset depth relative to the tree root - 2. absolute subdataset depth in the full hierarchy + + 1. ``ds_depth``: subdataset depth relative to the tree root + 2. ``ds_absolute_depth``: absolute subdataset depth in the full + hierarchy, potentially taking into account parent datasets at + levels above the tree root + + Returns + ------- + Tuple[int, int] + Tuple of relative dataset depth and absolute dataset depth """ ds_depth = 0 ds_absolute_depth = 0 @@ -692,9 +774,8 @@ def calculate_dataset_depth(self): class DirectoryOrDatasetNode: - """ - Factory class for creating either a ``DirectoryNode`` or ``DatasetNode``, - based on whether the current path is a dataset or not. 
+ """Factory class for creating either a ``DirectoryNode`` or + ``DatasetNode``, based on whether the path is a dataset or not. """ def __new__(cls, path, *args, **kwargs): if is_dataset(path): From 0329d2dd2d79b416e454d147b8ad1cc5aff98f36 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 28 Jul 2022 00:05:03 +0200 Subject: [PATCH 048/131] specify explicit command name in entrypoints (otherwise, does not generate man page and breaks build of sphinx docs) --- datalad_next/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_next/__init__.py b/datalad_next/__init__.py index 959019c8..d89aa34f 100644 --- a/datalad_next/__init__.py +++ b/datalad_next/__init__.py @@ -33,6 +33,8 @@ 'datalad_next.tree', # name of the command class implementation in above module 'TreeCommand', + # command name (differs from lowercase command class name) + 'tree' ) ] ) From b19319dc3255703742c0eb885e5ce316bcd307fe Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 28 Jul 2022 00:11:07 +0200 Subject: [PATCH 049/131] rewording in docstring --- datalad_next/tree.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index eefa7f63..31eac46c 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -49,7 +49,7 @@ class TreeCommand(Interface): 1. Glorified ``tree`` command: - List the contents of a directory tree and see which directories are + Display the contents of a directory tree and see which directories are datalad datasets (including subdatasets that are present but not installed, such as after a non-recursive clone). @@ -66,11 +66,11 @@ class TreeCommand(Interface): 2. Descriptor of nested subdataset hierarchies: - Visualize the structure of multiple datasets and their hierarchies at - once based on the subdataset nesting level, regardless of their location - in the directory tree. + Display the structure of multiple datasets and their hierarchies based + on subdataset nesting level, regardless of their location in the + directory tree. - In this case, the tree depth is determined by the subdataset depth. + In this case, the tree depth is determined by subdataset depth. There is also the option to display contents (directories/files) of each dataset up to max_depth levels, to provide better context around the datasets. From 72867632d5045b370c614ae43b3e6a1943a053fc Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 28 Jul 2022 00:11:53 +0200 Subject: [PATCH 050/131] remove short-form options for now (TBD) --- datalad_next/tree.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 31eac46c..7b048b25 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -86,7 +86,7 @@ class TreeCommand(Interface): Defaults to the current directory.""", constraints=EnsureStr() | EnsureNone()), depth=Parameter( - args=("-L", "--depth",), + args=("--depth",), doc="""maximum level of directory tree to display. If not specified, will display all levels. 
If paired with [CMD: --dataset-depth CMD][PY: dataset_depth PY], @@ -94,7 +94,7 @@ class TreeCommand(Interface): dataset.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), dataset_depth=Parameter( - args=("-R", "--dataset-depth",), + args=("--dataset-depth",), doc="""maximum level of nested subdatasets to display""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), include_files=Parameter( @@ -111,16 +111,16 @@ class TreeCommand(Interface): dict(text="Display up to 3 levels of the current directory's " "subdirectories and their contents", code_py="tree(depth=3, include_files=True)", - code_cmd="datalad tree -L 3 --include-files"), + code_cmd="datalad tree --depth 3 --include-files"), dict(text="Display all first- and second-level subdatasets of " "datasets located anywhere under /tmp (including in hidden " "directories) regardless of directory depth", code_py="tree('/tmp', dataset_depth=2, include_hidden=True)", - code_cmd="datalad tree /tmp -R 2 --include-hidden"), + code_cmd="datalad tree /tmp --dataset-depth 2 --include-hidden"), dict(text="Display first- and second-level subdatasets and their " "contents, up to 1 directory deep within each dataset", code_py="tree(dataset_depth=2, depth=1)", - code_cmd="datalad tree -R 2 -L 1"), + code_cmd="datalad tree --dataset-depth 2 --depth 1"), ] @staticmethod From 7a5fb88d7c56e594ced239bd0a7b29cf7354890a Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 28 Jul 2022 01:57:34 +0200 Subject: [PATCH 051/131] replace pathlib function with implementation compatible with python<3.9 --- datalad_next/tree.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 7b048b25..f0976c85 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -273,6 +273,15 @@ def get_superdataset(ds): datalad_only=True, topmost=False, registered_only=True) +def is_path_relative_to(my_path: Path, other_path: Path): + """Copy of pathlib's ``Path.is_relative_to()`` that requires python3.9+""" + try: + my_path.relative_to(other_path) + return True + except ValueError: + return False + + class Tree: """Main class for generating and serializing a directory tree""" @@ -529,7 +538,7 @@ def ds_child_node_exceeds_max_depth(path: Path): if not (path.is_dir() and is_dataset(path)): ds_parents = [ p for p in path.parents - if p.is_relative_to(self.root) and + if is_path_relative_to(p, self.root) and is_dataset(p) ] if ds_parents: @@ -763,7 +772,7 @@ def calculate_dataset_depth(self): break ds_absolute_depth += 1 - if Path(superds.path).is_relative_to(self.tree_root): + if is_path_relative_to(Path(superds.path), self.tree_root): # if the parent dataset is underneath the tree # root, we increment the relative depth ds_depth += 1 From f19133794e06c991f9e4d724734a5d4b825aab20 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 28 Jul 2022 20:25:44 +0200 Subject: [PATCH 052/131] change appearance of dataset marker, place before path for tidier display --- datalad_next/tests/test_tree.py | 54 ++++++++++++++++----------------- datalad_next/tree.py | 28 +++++++++++------ 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index cdabc150..ba2df050 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -463,8 +463,8 @@ class TestTreeWithDatasets(TestTree): "expected_stats_str": "2 datasets, 1 directory, 0 files", "expected_str": """ ├── repo0/ -├── superds0/ [DS~0] 
-└── superds1/ [DS~0] +├── [DS~0] superds0/ +└── [DS~0] superds1/ """, }, { @@ -472,15 +472,15 @@ class TestTreeWithDatasets(TestTree): "expected_stats_str": "7 datasets, 3 directories, 0 files", "expected_str": """ ├── repo0/ -├── superds0/ [DS~0] -│ └── sd0_subds0/ [DS~1] -│ └── sd0_sub0_subds0/ [DS~2] -└── superds1/ [DS~0] +├── [DS~0] superds0/ +│ └── [DS~1] sd0_subds0/ +│ └── [DS~2] sd0_sub0_subds0/ +└── [DS~0] superds1/ ├── sd1_dir0/ │ ├── sd1_d0_repo0/ - │ └── sd1_d0_subds0/ [DS~1] - ├── sd1_ds0/ [DS~0] - └── sd1_subds0/ [DS~1, not installed] + │ └── [DS~1] sd1_d0_subds0/ + ├── [DS~0] sd1_ds0/ + └── [DS~1] (not installed) sd1_subds0/ """, }, ] @@ -529,9 +529,9 @@ class TestDatasetTree(TestTree): "depth": 0, "expected_stats_str": "3 datasets, 0 directories, 0 files", "expected_str": """ -├── superds0/ [DS~0] -└── superds1/ [DS~0] - └── sd1_ds0/ [DS~0] +├── [DS~0] superds0/ +└── [DS~0] superds1/ + └── [DS~0] sd1_ds0/ """ }, { @@ -539,10 +539,10 @@ class TestDatasetTree(TestTree): "depth": 1, "expected_stats_str": "3 datasets, 1 directory, 0 files", "expected_str": """ -├── superds0/ [DS~0] -└── superds1/ [DS~0] +├── [DS~0] superds0/ +└── [DS~0] superds1/ ├── sd1_dir0/ - └── sd1_ds0/ [DS~0] + └── [DS~0] sd1_ds0/ """ }, { @@ -550,13 +550,13 @@ class TestDatasetTree(TestTree): "depth": 0, "expected_stats_str": "6 datasets, 1 directory, 0 files", "expected_str": """ -├── superds0/ [DS~0] -│ └── sd0_subds0/ [DS~1] -└── superds1/ [DS~0] +├── [DS~0] superds0/ +│ └── [DS~1] sd0_subds0/ +└── [DS~0] superds1/ ├── sd1_dir0/ - │ └── sd1_d0_subds0/ [DS~1] - ├── sd1_ds0/ [DS~0] - └── sd1_subds0/ [DS~1, not installed] + │ └── [DS~1] sd1_d0_subds0/ + ├── [DS~0] sd1_ds0/ + └── [DS~1] (not installed) sd1_subds0/ """ }, { @@ -564,14 +564,14 @@ class TestDatasetTree(TestTree): "depth": 2, "expected_stats_str": "6 datasets, 2 directories, 0 files", "expected_str": """ -├── superds0/ [DS~0] -│ └── sd0_subds0/ [DS~1] -└── superds1/ [DS~0] +├── [DS~0] superds0/ +│ └── [DS~1] sd0_subds0/ +└── [DS~0] superds1/ ├── sd1_dir0/ │ ├── sd1_d0_repo0/ - │ └── sd1_d0_subds0/ [DS~1] - ├── sd1_ds0/ [DS~0] - └── sd1_subds0/ [DS~1, not installed] + │ └── [DS~1] sd1_d0_subds0/ + ├── [DS~0] sd1_ds0/ + └── [DS~1] (not installed) sd1_subds0/ """ }, ] diff --git a/datalad_next/tree.py b/datalad_next/tree.py index f0976c85..307971f8 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -617,8 +617,8 @@ class _TreeNode: # symbols for the tip of the 'tree branch', depending on # whether a node is the last in it subtree or not - PREFIX_MIDDLE_CHILD = "├── " - PREFIX_LAST_CHILD = "└── " + PREFIX_MIDDLE_CHILD = "├──" + PREFIX_LAST_CHILD = "└──" # symbol for representing the continuation of a 'tree branch' INDENTATION_SYMBOL = "│" @@ -656,12 +656,11 @@ def __str__(self): if self.COLOR is not None: path = ansi_colors.color_word(path, self.COLOR) - prefix = "" if self.depth > 0: prefix = self.PREFIX_LAST_CHILD if self.is_last_child \ else self.PREFIX_MIDDLE_CHILD - - return prefix + str(path) + return " ".join([prefix, path]) + return str(path) # root directory has no prefix @staticmethod def stats_description(count): @@ -732,10 +731,21 @@ def __init__(self, *args, **kwargs): self.ds_depth, self.ds_absolute_depth = self.calculate_dataset_depth() def __str__(self): - install_flag = ", not installed" if not self.is_installed else "" - dir_suffix = "/" if self.depth > 0 else "" - ds_marker = f" [DS~{self.ds_absolute_depth}{install_flag}]" - return super().__str__() + dir_suffix + ds_marker + default_str = super().__str__() + + 
ds_marker_depth = ansi_colors.color_word( + f"DS~{self.ds_absolute_depth}", ansi_colors.WHITE) + install_flag = " (not installed)" if not self.is_installed else "" + ds_marker = f"[{ds_marker_depth}]{install_flag}" + + if self.depth > 0: + prefix = self.PREFIX_LAST_CHILD if self.is_last_child else \ + self.PREFIX_MIDDLE_CHILD + custom_str = default_str.replace(prefix, f"{prefix} {ds_marker}") + else: + custom_str = f"{ds_marker} {default_str}" + + return custom_str + ("/" if self.depth > 0 else "") @staticmethod def stats_description(count): From 3d1247498b0583a1bef3c50b8dd99985be109a70 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 29 Jul 2022 01:40:39 +0200 Subject: [PATCH 053/131] use common call to Tree/DatasetTree constructor --- datalad_next/tree.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 307971f8..f686573c 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -135,20 +135,21 @@ def __call__( include_hidden=False): if dataset_depth is not None: - tree = DatasetTree( - Path(path), - max_depth=depth, - max_dataset_depth=dataset_depth, - exclude_node_func=build_excluded_node_func( - include_hidden=include_hidden, include_files=include_files) - ) + # special tree defined by subdataset nesting depth + tree_cls = DatasetTree + dataset_tree_args = {"max_dataset_depth": dataset_depth} else: - tree = Tree( - Path(path), - max_depth=depth, - exclude_node_func=build_excluded_node_func( - include_hidden=include_hidden, include_files=include_files) - ) + # simple tree defined by directory depth + tree_cls = Tree + dataset_tree_args = {} + + tree = tree_cls( + Path(path), + max_depth=depth, + exclude_node_func=build_excluded_node_func( + include_hidden=include_hidden, include_files=include_files), + **dataset_tree_args + ) for line in tree.print_line(): # print one line at a time to improve UX / perceived speed @@ -516,9 +517,10 @@ def generate_nodes(self): Yield ``_TreeNode`` objects that belong to the tree. A ``DatasetTree`` is just an unlimited-depth ``Tree`` with more - complex rules for excluding nodes. Each exclusion rule is encoded - in a function. The rules are then combined in a final - ``exclusion_func`` which is supplied to the ``Tree`` constructor. + complex rules for pruning (skipping traversal of particular nodes). + Each exclusion rule is encoded in a function. The rules are then + combined in a final ``exclusion_func`` which is supplied to the + ``Tree`` constructor. 
Returns ------- From 7a74655d14f4f8ee89685682d35e33493b5d39a7 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 29 Jul 2022 22:22:06 +0200 Subject: [PATCH 054/131] use custom result renderers --- datalad_next/tree.py | 63 ++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index f686573c..3473a4e2 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -27,7 +27,7 @@ get_status_dict, ) from datalad.interface.utils import ( - eval_results, + eval_results, generic_result_renderer, ) from datalad.support.constraints import ( EnsureNone, @@ -151,17 +151,29 @@ def __call__( **dataset_tree_args ) - for line in tree.print_line(): - # print one line at a time to improve UX / perceived speed - print(line) - print("\n" + tree.stats() + "\n") + for node, line in tree.generate_nodes_with_str(): + # yield one node at a time to improve UX / perceived speed + yield get_status_dict( + action="tree", + status="ok", + path=node.path, + type=node.TYPE, + depth=node.depth, + node_str=line, + tree_stats=tree.stats() + ) - # return a generic OK status - yield get_status_dict( - action='tree', - status='ok', - path=path, - ) + @staticmethod + def custom_result_renderer(res, **kwargs): + print(res["node_str"]) + + @staticmethod + def custom_result_summary_renderer(res, **kwargs): + # print the summary 'report line' with count of nodes by type + print("\n" + res[-1]["tree_stats"] + "\n") + # print "ok" status for input path (root node) + root_node = res[0] + generic_result_renderer(root_node) def build_excluded_node_func(include_hidden=False, include_files=False): @@ -429,7 +441,10 @@ def to_string(self) -> str: return "\n".join(self._lines) def print_line(self): - """Generator for tree string output lines + """Generator for tree string output lines. + + When yielding, also stores the output in self._lines to avoid having + to recompute it. Returns ------- @@ -437,7 +452,7 @@ def print_line(self): """ if not self._lines: # string output has not been generated yet - for line in self._yield_lines(): + for _, line in self.generate_nodes_with_str(): self._lines.append(line) yield line else: @@ -446,12 +461,10 @@ def print_line(self): yield line yield "\n" # newline at the very end - def _yield_lines(self): - """Generator of lines of the tree string representation. - - Each line represents a tree node (directory or dataset or file). + def generate_nodes_with_str(self): + """Generator of tree nodes and their string representation. - A line follows the structure: + Each node is printed on one line. The string uses the format: ``[] [] `` Example line: @@ -459,11 +472,11 @@ def _yield_lines(self): Returns ------- - Generator[str] + Generator[Tuple[_TreeNode, str]] """ - # keep track of levels where subtree is exhaused, i.e. we have - # reached the last child of the subtree. + # keep track of levels where subtree is exhausted, i.e. we have + # reached the last child of the current subtree. # this is needed to build the indentation string for each node, # which takes into account whether any parent is the last node of # its own subtree. 
@@ -491,7 +504,7 @@ def _yield_lines(self): indentation = "".join(indentation_symbols_for_levels) line = indentation + str(node) - yield line + yield node, line class DatasetTree(Tree): @@ -573,6 +586,7 @@ def child_datasets(): node.ds_depth, _ = node.calculate_dataset_depth() yield node + # stop at the first matching datasets return any( ds.ds_depth <= self.max_dataset_depth for ds in child_datasets() @@ -615,6 +629,7 @@ class _TreeNode: """Base class for a directory or file represented as a single tree node and printed as single line of the 'tree' output. """ + TYPE = None # needed for command result dict COLOR = None # ANSI color for the path, if terminal color are enabled # symbols for the tip of the 'tree branch', depending on @@ -697,6 +712,7 @@ def parents(self): class DirectoryNode(_TreeNode): + TYPE = "directory" COLOR = ansi_colors.BLUE def __init__(self, *args, **kwargs): @@ -714,6 +730,8 @@ def stats_description(count): class FileNode(_TreeNode): + TYPE = "file" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -723,6 +741,7 @@ def stats_description(count): class DatasetNode(_TreeNode): + TYPE = "dataset" COLOR = ansi_colors.MAGENTA def __init__(self, *args, **kwargs): From 5730693f2381dbb26bd78b99e106a28c829175cc Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 30 Jul 2022 19:17:22 +0200 Subject: [PATCH 055/131] improve search algorithm: cache results of git operations, use 1 fixed parallel 'helper' tree instead of exponentially many redundant subtrees --- datalad_next/tree.py | 350 ++++++++++++++++++++++++++++++------------- 1 file changed, 249 insertions(+), 101 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 3473a4e2..87c63e1c 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -21,7 +21,7 @@ from datalad.support.param import Parameter from datalad.distribution.dataset import ( datasetmethod, - require_dataset, + require_dataset, Dataset, ) from datalad.interface.results import ( get_status_dict, @@ -34,6 +34,7 @@ EnsureStr, EnsureInt, EnsureRange, ) from datalad.support import ansi_colors +from datalad.utils import get_dataset_root lgr = logging.getLogger('datalad.local.tree') @@ -222,6 +223,23 @@ def _wrapper(*args, **kwargs): return _wrapper +def yield_with_last_item(generator): + """Takes a generator and yields for each item, the item itself and + whether it is the last item in the sequence. + + Returns + ------- + Tuple[bool, Any] + A tuple (is_last_item, item) + """ + prev_val = next(generator, None) + if prev_val is not None: + for current_val in generator: + yield False, prev_val + prev_val = current_val + yield True, prev_val + + @lru_cache def is_dataset(path: Path): """Fast dataset detection. @@ -243,14 +261,11 @@ def is_dataset(path: Path): path: Path Path to directory to be identified as dataset or non-dataset """ - ds = require_dataset(path, check_installed=False) - ds_path = Path(ds.path) - # detect if it is an installed datalad-proper dataset # (as opposed to git/git-annex repo). # could also query `ds.id`, but checking just for existence # of config file is quicker. - if Path(ds_path / ".datalad" / "config").is_file(): + if Path(path / ".datalad" / "config").is_file(): return True # if it is not installed, check if it has an installed superdataset. @@ -258,36 +273,112 @@ def is_dataset(path: Path): # directory has the .git folder), we check if the directory # is empty (faster) -- as e.g. 
after a non-recursive `datalad clone` def is_empty_dir(): - return not any(ds_path.iterdir()) + return not any(path.iterdir()) if is_empty_dir(): - if get_superdataset(ds) is not None: + if get_superdataset(path) is not None: return True return False @lru_cache -def get_superdataset(ds): - """Wrapper for ``Dataset.get_superdataset()`` with predefined options. +def get_subds_paths(ds_path: Path): + """Return paths of immediate subdatasets for a given dataset path. + + This is an expensive operation because it calls git to read the + submodules. Since we need to run this to (A) calculate dataset depth and + (B) detect non-installed datasets, we cache results, so that the list of + subdatasets is computed only once for each parent dataset. + """ + def res_filter(res): + return res.get('status') == 'ok' and res.get('type') == 'dataset' + + return Dataset(ds_path).subdatasets( + recursive=False, + result_filter=res_filter, + on_failure='ignore', + result_xfm='paths', + result_renderer='disabled', + return_type='list' + ) - Results are cached, as this function may be rerun on the same dataset - multiple times. + +def get_dataset_root_datalad_only(path: Path): + """Get root of dataset containing a given path (datalad datasets only, + not pure git/git-annex repo) Parameters ---------- - ds: Dataset + path: Path + Path to file or directory + + Returns + ------- + Path + """ + ds_root = path + while ds_root: + potential_ds_root = get_dataset_root(str(ds_root)) + + if potential_ds_root is None: + return None # we are not inside a dataset + + potential_ds_root = Path(potential_ds_root) + if is_dataset(potential_ds_root): + return potential_ds_root # it's a match + + # we go one directory higher and try again + ds_root = Path.resolve(potential_ds_root / '..') + return ds_root + + +@lru_cache +def get_superdataset(path: Path): + """Reimplementation of ``Dataset.get_superdataset()`` to allow caching + results of `ds.subdatasets()` (the most expensive operation). + + Parameters + ---------- + path: Path + Path to a dataset Returns ------- Dataset or None """ - return ds.get_superdataset( - datalad_only=True, topmost=False, registered_only=True) + path = str(path) + superds_path = None + + while path: + # normalize the path after adding .. 
so we guaranteed to not + # follow into original directory if path itself is a symlink + parent_path = Path.resolve(Path(path) / '..') + sds_path_ = get_dataset_root_datalad_only(parent_path) + if sds_path_ is None: + # no more parents, use previous found + break + + superds = Dataset(sds_path_) + + # test if path is registered subdataset of the parent + if not any(is_path_relative_to(Path(p), Path(path)) + for p in get_subds_paths(Path(superds.path))): + break + + # That was a good candidate + superds_path = sds_path_ + path = str(parent_path) + break + + if superds_path is None: + # None was found + return None + return Dataset(superds_path) def is_path_relative_to(my_path: Path, other_path: Path): - """Copy of pathlib's ``Path.is_relative_to()`` that requires python3.9+""" + """Port of pathlib's ``Path.is_relative_to()`` that requires python3.9+""" try: my_path.relative_to(other_path) return True @@ -378,31 +469,34 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): Whether the directory ``dir_path`` is the last child of its parent in the ordered list of child nodes """ - if not self.skip_root or \ self.skip_root and self.path_depth(dir_path) > 0: yield DirectoryOrDatasetNode( dir_path, self.path_depth(dir_path), is_last_child ) - # check that we are within max_depth levels (None means unlimited - # depth) + # check that we are within max_depth levels + # (None means unlimited depth) if self.max_depth is None or \ self.path_depth(dir_path) < self.max_depth: - # apply exclusion filter - selected_children = ( - p for p in dir_path.iterdir() + # sort child nodes alphabetically + # needs to be done *before* calling the exclusion function, + # because the function may depend on sort order + all_children = sorted(list(dir_path.iterdir())) + + # apply exclusion filters + children = ( + p for p in all_children if not self.exclude_node_func(p) ) - # sort directory contents alphabetically - children = sorted(list(selected_children)) - for child in children: - - # check if node is the last child within its subtree (needed - # for displaying special end-of-subtree prefix) - is_last_child = (child == children[-1]) + # exclusion function could be expensive to compute, so we generate + # child nodes, but we need to be able to detect the last child + # within its subtree (needed for displaying special + # end-of-subtree prefix). so we wrap the generator in another + # generator to detect the last item. + for is_last_child, child in yield_with_last_item(children): if child.is_dir(): # recurse into subdirectories @@ -524,6 +618,15 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): self.max_dataset_depth = max_dataset_depth + # generator that will traverse the whole tree (once) and yield + # only datasets and their parents directories, hand-in-hand with the + # main node generator + self._ds_generator = self._generate_datasets() + # current value of the generator. it will be initialized lazily, + # so for now we set it to the a `_TreeNode` with dummy depth just + # distinguish it from None (None means the generator has finished). 
+ self._next_ds = _TreeNode(self.root, -1, False) + @increment_node_count def generate_nodes(self): """ @@ -540,89 +643,134 @@ def generate_nodes(self): Generator[_TreeNode] """ - def is_git_folder(path: Path): - return path.is_dir() and path.name == ".git" + def exclude_func(path: Path): + """Exclusion function -- here is the crux of the logic for + pruning the dataset tree.""" - def ds_exceeds_max_ds_depth(path: Path): - if path.is_dir() and is_dataset(path): - ds = DatasetNode(path, self.path_depth(path), False) - return ds.ds_depth > self.max_dataset_depth - return False - - def ds_child_node_exceeds_max_depth(path: Path): - if not (path.is_dir() and is_dataset(path)): - ds_parents = [ - p for p in path.parents - if is_path_relative_to(p, self.root) and - is_dataset(p) - ] - if ds_parents: - parent = ds_parents[-1] # closest parent - relative_depth = self.path_depth(path) - self.path_depth(parent) - return relative_depth > self.max_depth - return True + # initialize dataset(-parent) generator if not done yet + if self._next_ds is not None and \ + self._next_ds.depth == -1: # dummy depth + self._advance_ds_generator() - def is_parent_of_included_ds(path: Path): - def exclude(p: Path): - return not p.is_dir() or is_git_folder(p) - - if path.is_dir() and not is_dataset(path): - # search in the subtree with the current path as root - subtree = Tree( - path, - max_depth=None, - exclude_node_func=exclude, - skip_root=True - ) + if path.is_dir() and is_dataset(path): + # check if maximum dataset depth is exceeded + is_valid_ds = self._is_valid_dataset(path) + if is_valid_ds: + self._advance_ds_generator() # go to next dataset(-parent) + return not is_valid_ds + + # exclude file or directory underneath a dataset, + # if it has depth (relative to dataset root) > max_depth, + # unless (in case of a directory) it is itself the parent of a + # valid dataset. if it's a parent of a dataset, we don't apply + # any filters -- it's just a means to get to the next dataset. + if not self._is_parent_of_ds(path): + return self.exclude_node_func(path) or \ + self._ds_child_node_exceeds_max_depth(path) + + return False # do not exclude + + tree = Tree( + self.root, + max_depth=None, # unlimited traversal (datasets could be anywhere) + exclude_node_func=exclude_func, + skip_root=self.skip_root, + ) - def child_datasets(): - """Generator of dataset nodes below the current node""" - for node in subtree.generate_nodes(): - if isinstance(node, DatasetNode): - # offset depth by depth of current path - node.depth += self.path_depth(path) - # need to recalculate dataset depth after - # updating directory depth - node.ds_depth, _ = node.calculate_dataset_depth() - yield node - - # stop at the first matching datasets - return any( - ds.ds_depth <= self.max_dataset_depth - for ds in child_datasets() - ) + yield from tree.generate_nodes() - return False + def _advance_ds_generator(self): + """Go to the next dataset or parent of dataset""" + self._next_ds = next(self._ds_generator, None) - def exclude_func(path: Path): - """Combine exclusion criteria from different functions""" - criteria = self.exclude_node_func(path) + def _generate_datasets(self): + """Generator of dataset nodes and their parent directories starting + from the tree root and up to ``max_dataset_depth`` levels. 
- if path.is_dir() and is_dataset(path): - # check if maximum dataset depth is exceeded - criteria |= ds_exceeds_max_ds_depth(path) - else: - # do not traverse the .git folder (we will not find - # datasets underneath it) - criteria |= is_git_folder(path) + This second tree will be generated in parallel with the main tree + with an offset, such that it always points to the next dataset (or + dataset parent) relative to the current node in the main tree. This + allows us to 'look into the future' to decide whether to prune the + current node or not, without having to spawn new subtree generators + for each node (which would re-traverse the same datasets over again). - # exclude files or directories underneath a dataset, - # if they have depth (relative to dataset root) > max_depth, - # unless they are themselves parents of a dataset with - # dataset depth within the valid ds_depth range - if not (path.is_dir() and is_parent_of_included_ds(path)): - criteria |= ds_child_node_exceeds_max_depth(path) + Returns + ------- + Generator[DirectoryNode or DatasetNode] + """ - return criteria + def exclude(p: Path): + # we won't find any datasets underneath the git folder + return not p.is_dir() or \ + (p.is_dir() and p.name == ".git") ds_tree = Tree( self.root, - max_depth=None, # unlimited traversal (datasets could be anywhere) - exclude_node_func=exclude_func, - skip_root=self.skip_root, + max_depth=None, + exclude_node_func=exclude, + skip_root=True, ) - yield from ds_tree.generate_nodes() + visited_parents = set([]) + + for node in ds_tree.generate_nodes(): + if isinstance(node, DatasetNode) and \ + node.ds_depth <= self.max_dataset_depth and \ + not self.exclude_node_func(node.path): + + # yield parent directories if not already done + for depth, parent in enumerate(node.parents): + if depth == 0 and ds_tree.skip_root: + continue + if parent not in visited_parents: + visited_parents.add(parent) + + yield DirectoryOrDatasetNode( + parent, + depth, + None # we don't care if it's the last child or not + ) + + visited_parents.add(node.path) + yield node + + def _is_valid_dataset(self, path: Path): + return path.is_dir() and \ + is_path_relative_to(path, self.root) and \ + is_dataset(path) and \ + not self.exclude_node_func(path) and \ + not self._ds_exceeds_max_ds_depth(path) + + def _ds_exceeds_max_ds_depth(self, path: Path): + ds = DatasetNode(path, self.path_depth(path), False) + return ds.ds_depth > self.max_dataset_depth + + def _ds_child_node_exceeds_max_depth(self, path: Path): + ds_parent = get_dataset_root_datalad_only(path) + if ds_parent is None: + return True # it's not a dataset child, we exclude it + + if not self._is_valid_dataset(ds_parent): + return True # also exclude + + # check directory depth relative to the dataset parent + rel_depth = self.path_depth(path) - self.path_depth(ds_parent) + assert rel_depth >= 0 # sanity check + return rel_depth > self.max_depth + + def _is_parent_of_ds(self, path: Path): + if not path.is_dir(): + return False # files can't be parents + + if self._next_ds is None: + return False # no more datasets, can't be a parent + + if self._next_ds.path == path: + # we hit a dataset or the parent of a dataset + self._advance_ds_generator() + return True + + return False class _TreeNode: @@ -695,8 +843,7 @@ def tree_root(self) -> Path: @property def parents(self): - """List of parent paths (beginning from the tree root) in top-down - order. + """List of parent paths in top-down order beginning from the tree root. 
Returns ------- @@ -772,6 +919,7 @@ def __str__(self): def stats_description(count): return str(count) + (" dataset" if int(count) == 1 else " datasets") + @lru_cache def calculate_dataset_depth(self): """ Calculate 2 measures of a dataset's nesting depth/level: @@ -792,7 +940,7 @@ def calculate_dataset_depth(self): ds = self.ds while ds: - superds = get_superdataset(ds) + superds = get_superdataset(ds.path) if superds is None: # it is not a dataset, do nothing From 054cb98144e075409cb5b38b949da0e47a118184 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 30 Jul 2022 19:26:17 +0200 Subject: [PATCH 056/131] update docstrings --- datalad_next/tree.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 87c63e1c..0c6f713f 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -618,13 +618,13 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): self.max_dataset_depth = max_dataset_depth - # generator that will traverse the whole tree (once) and yield - # only datasets and their parents directories, hand-in-hand with the - # main node generator + # secondary 'helper' generator that will traverse the whole tree + # (once) and yield only datasets and their parents directories self._ds_generator = self._generate_datasets() - # current value of the generator. it will be initialized lazily, - # so for now we set it to the a `_TreeNode` with dummy depth just - # distinguish it from None (None means the generator has finished). + # current value of the ds_generator. the generator will be initialized + # lazily, so for now we set the value to a dummy `_TreeNode` + # with an impossible depth just to distinguish it from None (None means + # the generator has finished). self._next_ds = _TreeNode(self.root, -1, False) @increment_node_count @@ -687,12 +687,15 @@ def _generate_datasets(self): """Generator of dataset nodes and their parent directories starting from the tree root and up to ``max_dataset_depth`` levels. - This second tree will be generated in parallel with the main tree - with an offset, such that it always points to the next dataset (or - dataset parent) relative to the current node in the main tree. This - allows us to 'look into the future' to decide whether to prune the - current node or not, without having to spawn new subtree generators - for each node (which would re-traverse the same datasets over again). + This second 'helper' tree will be generated in parallel with the main + tree but with an offset, such that it always points to the next + dataset (or dataset parent) relative to the current node in the main + tree. + + This allows us to 'look into the future' to decide whether to prune the + current node in the main tree or not, without having to spawn new + subtree generators for each node (which would re-traverse the same + nodes over again, with an exponential factor). Returns ------- @@ -775,8 +778,7 @@ def _is_parent_of_ds(self, path: Path): class _TreeNode: """Base class for a directory or file represented as a single tree node - and printed as single line of the 'tree' output. 
- """ + and printed as single line of the 'tree' output.""" TYPE = None # needed for command result dict COLOR = None # ANSI color for the path, if terminal color are enabled From 27450657e10e8b0fd8cc8e1beabf7f1ad10a810d Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 30 Jul 2022 19:49:41 +0200 Subject: [PATCH 057/131] reword docstrings --- datalad_next/tree.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 0c6f713f..357131a1 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -687,15 +687,13 @@ def _generate_datasets(self): """Generator of dataset nodes and their parent directories starting from the tree root and up to ``max_dataset_depth`` levels. - This second 'helper' tree will be generated in parallel with the main - tree but with an offset, such that it always points to the next - dataset (or dataset parent) relative to the current node in the main - tree. - - This allows us to 'look into the future' to decide whether to prune the - current node in the main tree or not, without having to spawn new - subtree generators for each node (which would re-traverse the same - nodes over again, with an exponential factor). + This secondary 'helper' tree will be generated in parallel with the + main tree but will be one step ahead, such that it always points to + the next dataset (or dataset parent) relative to the current node in + the main tree. + + We can use it to look into downstream/future nodes and decide + efficiently whether to prune the current node in the main tree. Returns ------- From c71a1b3a595c8c3d801ba621749c8dc29f30e12c Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Mon, 1 Aug 2022 19:37:22 +0200 Subject: [PATCH 058/131] do not print generic render output (command with status 'ok') in custom renderer --- datalad_next/tree.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 357131a1..fc56c243 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -171,10 +171,7 @@ def custom_result_renderer(res, **kwargs): @staticmethod def custom_result_summary_renderer(res, **kwargs): # print the summary 'report line' with count of nodes by type - print("\n" + res[-1]["tree_stats"] + "\n") - # print "ok" status for input path (root node) - root_node = res[0] - generic_result_renderer(root_node) + print("\n" + res[-1]["tree_stats"]) def build_excluded_node_func(include_hidden=False, include_files=False): From ac99ee37040606d9a5ea9e3755d66e78624dabc7 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Mon, 1 Aug 2022 21:27:38 +0200 Subject: [PATCH 059/131] get subdatasets by calling command to avoid import of full dataset API --- datalad_next/tree.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index fc56c243..54729d0c 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -29,6 +29,7 @@ from datalad.interface.utils import ( eval_results, generic_result_renderer, ) +from datalad.local.subdatasets import Subdatasets from datalad.support.constraints import ( EnsureNone, EnsureStr, EnsureInt, EnsureRange, @@ -291,7 +292,10 @@ def get_subds_paths(ds_path: Path): def res_filter(res): return res.get('status') == 'ok' and res.get('type') == 'dataset' - return Dataset(ds_path).subdatasets( + # call subdatasets command instead of dataset method `ds.subdatasets()` + # to avoid potentially expensive import of full datalad API + 
return Subdatasets.__call__(
+        dataset=ds_path,
         recursive=False,
         result_filter=res_filter,
         on_failure='ignore',
         result_xfm='paths',

From 80d01475055b630ce3c220dbe67a6dc00e77332a Mon Sep 17 00:00:00 2001
From: Caterina Trainito
Date: Thu, 4 Aug 2022 20:05:26 +0200
Subject: [PATCH 060/131] major refactor: move all tree2string logic to custom
 renderer in command class, call CLI command in tests

---
 datalad_next/tests/test_tree.py | 163 +++++++++--------
 datalad_next/tree.py            | 355 ++++++++++++++++++++------------------
 2 files changed, 221 insertions(+), 297 deletions(-)

diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py
index ba2df050..ba0ac831 100644
--- a/datalad_next/tests/test_tree.py
+++ b/datalad_next/tests/test_tree.py
@@ -3,6 +3,7 @@
 import pytest
 
 from datalad.distribution.dataset import Dataset
+from datalad.cli.tests.test_main import run_main
 from datalad.tests.test_utils_testrepos import BasicGitTestRepo
 from datalad.tests.utils_pytest import (
     assert_raises,
@@ -11,7 +12,7 @@
 )
 from datalad.utils import rmtemp
 
-from ..tree import Tree, DatasetTree, build_excluded_node_func
+from ..tree import Tree
 
 """Tests for the ``datalad tree`` command."""
 
@@ -150,6 +151,37 @@ def path_ds():
     assert not temp_dir_root.exists()
 
 
+def get_tree_rendered_output(tree_cmd: list):
+    """
+    Run 'tree' CLI command with the given list of arguments and
+    return the output of the custom results renderer, broken down into
+    3 components (tree root, tree body, report line).
+
+    Assumes command exit code 0 and no additional logging to stdout.
+
+    Parameters
+    ----------
+    tree_cmd: list(str)
+        'tree' command given as list of strings
+
+    Returns
+    -------
+    tuple
+        3-value tuple consisting of: tree root, tree body, report line
+    """
+    # remove any empty strings from command
+    out, _ = run_main([c for c in tree_cmd if c != ''])
+
+    # remove trailing newline
+    lines = out.rstrip("\n").split("\n")
+
+    root = lines[0]  # first line of tree output
+    body = "\n".join(lines[1:-1])
+    report = lines[-1]
+
+    return root, body, report
+
+
 @pytest.fixture(scope="class")
 def inject_path_no_ds(request, path_no_ds):
     """
@@ -386,16 +418,15 @@ class TestTreeWithoutDatasets(TestTree):
     def test_print_tree(
             self, depth, include_files, include_hidden, expected_str
     ):
-        root = Path(self.path) / "root"
-        tree = Tree(
-            root, max_depth=depth,
-            exclude_node_func=build_excluded_node_func(
-                include_hidden=include_hidden, include_files=include_files
-            ),
-            skip_root=True  # skip the first line with the root directory
-        )
-        lines = tree.print_line()
-        actual_res = "\n".join(line for line in lines) + "\n"
+        root = str(self.path / "root")
+        command = [
+            'tree',
+            root,
+            '--depth', str(depth),
+            '--include-hidden' if include_hidden else '',
+            '--include-files' if include_files else ''
+        ]
+        _, actual_res, _ = get_tree_rendered_output(command)
         expected_res = expected_str.lstrip("\n")  # strip first newline
         print("expected:")
         print(expected_res)
@@ -406,14 +437,15 @@ def test_print_stats(
             self, depth, include_files, include_hidden, expected_stats_str
     ):
-        root = self.path / 'root'
-        tree = Tree(
-            root, max_depth=depth,
-            exclude_node_func=build_excluded_node_func(
-                include_hidden=include_hidden, include_files=include_files
-            ),
-        ).build()
-        actual_res = tree.stats()
+        root = str(self.path / 'root')
+        command = [
+            'tree',
+            root,
+            '--depth', str(depth),
+            '--include-hidden' if include_hidden else '',
+            '--include-files' if include_files else ''
+        ]
+        _, _, actual_res = get_tree_rendered_output(command)
         expected_res = expected_stats_str
         assert_str_equal(expected_res, actual_res)
 
@@ -425,41 +457,34 @@ def test_root_path_is_normalized(self, root_dir_name):
         Test that root path in the first line of string output
         is normalized path
         """
-        root = self.path / root_dir_name
-        tree = Tree(root, max_depth=0)
+        root = str(self.path / root_dir_name)
+        command = ['tree', root, '--depth', '0']
+        actual, _, _ = get_tree_rendered_output(command)
         expected = str(self.path / "root")
-        actual = next(tree.print_line())  # first line of tree output
-        assert_str_equal(expected, actual)
-
-    def test_tree_to_string(self):
-        root = self.path / 'root'
-        tree = Tree(root, 3)
-        actual = tree.to_string()
-        expected = "\n".join(tree._lines)
         assert_str_equal(expected, actual)
 
     def test_print_tree_depth_zero(self):
-        root = self.path / "root"
-        tree = Tree(
-            root,
-            max_depth=0,
-            # including files should have no effect
-            exclude_node_func=build_excluded_node_func(include_files=True)
-        )
-        actual = tree.to_string()
-        expected = str(root)
+        root = str(self.path / "root")
+        # including files should have no effect
+        command = ['tree', root, '--depth', '0', '--include-files']
+        actual, _, _ = get_tree_rendered_output(command)
+        expected = str(self.path / "root")
         assert_str_equal(expected, actual)
 
 
 @pytest.mark.usefixtures("inject_path_ds")
-class TestTreeWithDatasets(TestTree):
+class TestTreeWithDatasets(TestTreeWithoutDatasets):
     """Test directory tree with datasets"""
 
     __test__ = True
 
+    # set `include_files` and `include_hidden` to False,
+    # they should be already covered in `TestTreeWithoutDatasets`
     MATRIX = [
     {
         "depth": 1,
+        "include_files": False,
+        "include_hidden": False,
         "expected_stats_str": "2 datasets, 1 directory, 0 files",
         "expected_str": """
 ├── repo0/
@@ -469,6 +494,8 @@ class TestTreeWithDatasets(TestTree):
     },
     {
         "depth": 4,
+        "include_files": False,
+        "include_hidden": False,
         "expected_stats_str": "7 datasets, 3 directories, 0 files",
         "expected_str": """
 ├── repo0/
@@ -485,37 +512,6 @@ class TestTreeWithDatasets(TestTree):
     },
     ]
 
-    params = {
-        "test_print_tree": [
-            "depth", "expected_str"
-        ],
-        "test_print_stats": [
-            "depth", "expected_stats_str"
-        ]
-    }
-
-    def test_print_tree(
-            self, depth, expected_str
-    ):
-        root = self.path / "root"
-        tree = Tree(
-            root, max_depth=depth,
-            skip_root=True  # skip the first line with the root directory
-        )
-        lines = tree.print_line()
-        actual_res = "\n".join(l for l in lines) + "\n"
-        expected_res = expected_str.lstrip("\n")  # strip first newline
-        assert_str_equal(expected_res, actual_res)
-
-    def test_print_stats(
-            self, depth, expected_stats_str
-    ):
-        root = self.path / 'root'
-        tree = Tree(root, max_depth=depth).build()
-        actual_res = tree.stats()
-        expected_res = expected_stats_str
-        assert_str_equal(expected_res, actual_res)
-
 
 @pytest.mark.usefixtures("inject_path_ds")
 class TestDatasetTree(TestTree):
@@ -588,12 +584,14 @@ def test_print_tree(
             self, dataset_depth, depth, expected_str
     ):
-        root = self.path / "root"
-        tree = DatasetTree(
-            root, max_depth=depth, max_dataset_depth=dataset_depth,
-            skip_root=True)
-        lines = tree.print_line()
-        actual_res = "\n".join(l for l in lines) + "\n"
+        root = str(self.path / "root")
+        command = [
+            'tree',
+            root,
+            '--depth', str(depth),
+            '--dataset-depth', str(dataset_depth)
+        ]
+        _, actual_res, _ = get_tree_rendered_output(command)
         expected_res = expected_str.lstrip("\n")  # strip first newline
         print("expected:")
         print(expected_res)
@@ -604,10 +602,13 @@ def test_print_stats(
             self,
dataset_depth, depth, expected_stats_str ): - root = self.path / 'root' - tree = DatasetTree( - root, max_depth=depth, max_dataset_depth=dataset_depth - ).build() - actual_res = tree.stats() + root = str(self.path / "root") + command = [ + 'tree', + root, + '--depth', str(depth), + '--dataset-depth', str(dataset_depth) + ] + _, _, actual_res = get_tree_rendered_output(command) expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 54729d0c..eb7985dc 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -26,15 +26,13 @@ from datalad.interface.results import ( get_status_dict, ) -from datalad.interface.utils import ( - eval_results, generic_result_renderer, -) +from datalad.interface.utils import eval_results + from datalad.local.subdatasets import Subdatasets from datalad.support.constraints import ( EnsureNone, EnsureStr, EnsureInt, EnsureRange, ) -from datalad.support import ansi_colors from datalad.utils import get_dataset_root lgr = logging.getLogger('datalad.local.tree') @@ -153,26 +151,112 @@ def __call__( **dataset_tree_args ) - for node, line in tree.generate_nodes_with_str(): + for node in tree.generate_nodes(): # yield one node at a time to improve UX / perceived speed yield get_status_dict( action="tree", status="ok", - path=node.path, + path=str(node.path), type=node.TYPE, depth=node.depth, - node_str=line, - tree_stats=tree.stats() + exhausted_levels=tree.exhausted_levels, + count={ + "datasets": tree.node_count["DatasetNode"], + "directories": tree.node_count["DirectoryNode"], + "files": tree.node_count["FileNode"] + }, + **{ + "dataset_depth": node.ds_depth, + "dataset_abs_depth": node.ds_absolute_depth, + "dataset_is_installed": node.is_installed + } if node.TYPE == "dataset" else {}, ) @staticmethod def custom_result_renderer(res, **kwargs): - print(res["node_str"]) + """ + Each node is printed on one line. 
The string uses the format: + ``[] [] [ 0: + indentation_symbols_for_levels = [ + ("│" + if level not in exhausted_levels + else " ") + " " + for level in range(1, depth) + ] + indentation = "".join(indentation_symbols_for_levels) + + # build prefix (tree branch tip) + prefix = "" + if depth > 0: # root node has no prefix + is_last_child = depth in exhausted_levels + prefix = "└──" if is_last_child else "├──" + + # build dataset marker if dataset + ds_marker = "" + if node_type == "dataset": + ds_absolute_depth = res["dataset_abs_depth"] + ds_is_installed = res["dataset_is_installed"] + + ds_marker_depth = ansi_colors.color_word( + f"DS~{ds_absolute_depth}", + ansi_colors.WHITE) + install_flag = " (not installed)" if not ds_is_installed else "" + ds_marker = f"[{ds_marker_depth}]" + install_flag + + # build path string with optional color + # display only root directory with full path, all other nodes + # with basename + path = node_path if depth == 0 else Path(node_path).name + color_for_type = { + "dataset": ansi_colors.MAGENTA, + "directory": ansi_colors.BLUE, + "file": None + } + # ANSI color for the path, if terminal colors are enabled + color = color_for_type[node_type] + if color is not None: + path = ansi_colors.color_word(path, color) + + # set suffix for directories + dir_suffix = "" + if depth > 0 and node_type in ("directory", "dataset"): + dir_suffix = "/" + + line = indentation + \ + " ".join((s for s in (prefix, ds_marker, path) if s != "")) + \ + dir_suffix + print(line) @staticmethod def custom_result_summary_renderer(res, **kwargs): - # print the summary 'report line' with count of nodes by type - print("\n" + res[-1]["tree_stats"]) + """Print the summary 'report line' with count of nodes by type""" + + c_ds = res[-1]['count']['datasets'] + c_dirs = res[-1]['count']['directories'] + c_files = res[-1]['count']['files'] + + descriptions = [ + f"{c_ds} " + ("dataset" if int(c_ds) == 1 else "datasets"), + f"{c_dirs} " + ("directory" if int(c_dirs) == 1 else "directories"), + f"{c_files} " + ("file" if int(c_files) == 1 else "files") + ] + + print("\n" + ", ".join(descriptions)) def build_excluded_node_func(include_hidden=False, include_files=False): @@ -209,12 +293,12 @@ def _wrapper(*args, **kwargs): self = args[0] # 'self' is a Tree instance for node in node_generator_func(*args, **kwargs): node_type = node.__class__.__name__ - if node_type not in self._stats: + if node_type not in self.node_count: raise ValueError( - f"No stats collected for unknown node type '{node_type}'" + f"No counts collected for unknown node type '{node_type}'" ) if node.depth > 0: # we do not count the root directory - self._stats[node_type] += 1 + self.node_count[node_type] += 1 yield node # yield what the generator yielded @@ -305,6 +389,13 @@ def res_filter(res): ) +def is_subds_of_parent(subds_path: Path, parent_path: Path): + return any( + is_path_relative_to(Path(p), subds_path) + for p in get_subds_paths(parent_path) + ) + + def get_dataset_root_datalad_only(path: Path): """Get root of dataset containing a given path (datalad datasets only, not pure git/git-annex repo) @@ -363,8 +454,7 @@ def get_superdataset(path: Path): superds = Dataset(sds_path_) # test if path is registered subdataset of the parent - if not any(is_path_relative_to(Path(p), Path(path)) - for p in get_subds_paths(Path(superds.path))): + if not is_subds_of_parent(Path(path), Path(superds.path)): break # That was a good candidate @@ -420,12 +510,19 @@ def __init__(self, # set custom or default filter criteria 
self.exclude_node_func = exclude_node_func or self.default_exclude_func - # store list of lines of output string - self._lines = [] + # keep track of levels where the subtree is exhausted, + # i.e. we have reached the last child of the current subtree. + # this is needed for the custom results renderer, to display + # nodes differently based on their relative position in the tree. + self.exhausted_levels = set([]) - # store dict with count of nodes for each _TreeNode subtype - self._stats = {node_type.__name__: 0 - for node_type in _TreeNode.__subclasses__()} + # store dict with count of nodes for each node type, similar to the + # tree command's 'report line' at the end of the output. + # the node types (subclasses of ``_TreeNode``) are mutually exclusive, + # so the sum of their counts equals to the total node count. + # does not count the root itself, only the contents below the root. + self.node_count = {node_type.__name__: 0 + for node_type in _TreeNode.__subclasses__()} @staticmethod def default_exclude_func(path: Path): @@ -437,28 +534,7 @@ def path_depth(self, path: Path) -> int: the tree""" return len(path.relative_to(self.root).parts) - def stats(self) -> str: - """ - Produces a string with counts of different node types, similar - to the tree command's 'report line' at the end of the tree - output. - - The node types (subclasses of ``_TreeNode``) are mutually exclusive, - so the sum of their counts equals to the total node count. - - Does not count the root itself, only the contents below the root. - """ - # sort node type names alphabetically - node_types = sorted( - _TreeNode.__subclasses__(), - key=lambda c: c.__name__ - ) - return ", ".join( - node_type.stats_description(self._stats[node_type.__name__]) - for node_type in node_types - ) - - def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): + def _generate_tree_nodes(self, dir_path: Path): """Recursively yield ``_TreeNode`` objects starting from ``dir_path`` @@ -466,15 +542,10 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): ---------- dir_path: Path Directory from which to calculate the tree - is_last_child: bool - Whether the directory ``dir_path`` is the last child of its - parent in the ordered list of child nodes """ if not self.skip_root or \ self.skip_root and self.path_depth(dir_path) > 0: - yield DirectoryOrDatasetNode( - dir_path, self.path_depth(dir_path), is_last_child - ) + yield DirectoryOrDatasetNode(dir_path, self.path_depth(dir_path)) # check that we are within max_depth levels # (None means unlimited depth) @@ -492,18 +563,24 @@ def _generate_tree_nodes(self, dir_path: Path, is_last_child=True): if not self.exclude_node_func(p) ) - # exclusion function could be expensive to compute, so we generate - # child nodes, but we need to be able to detect the last child - # within its subtree (needed for displaying special - # end-of-subtree prefix). so we wrap the generator in another - # generator to detect the last item. + # exclusion function could be expensive to compute, so we + # use a generator for child nodes. however, we need to be able + # to detect the last child node within each subtree (needed for + # displaying special end-of-subtree prefix). so we wrap the + # generator in another 'lookahead' generator to detect the last + # item. 
for is_last_child, child in yield_with_last_item(children): + if is_last_child: # last child of its subtree + self.exhausted_levels.add(self.path_depth(child)) + else: + self.exhausted_levels.discard(self.path_depth(child)) + if child.is_dir(): # recurse into subdirectories - yield from self._generate_tree_nodes(child, is_last_child) + yield from self._generate_tree_nodes(child) else: - yield FileNode(child, self.path_depth(child), is_last_child) + yield FileNode(child, self.path_depth(child)) @increment_node_count def generate_nodes(self): @@ -523,84 +600,6 @@ def generate_nodes(self): # underlying generator. yield from self._generate_tree_nodes(self.root) - def build(self): - """Construct the tree string representation (will be stored in - instance attribute) and return the instance.""" - self.to_string() - return self - - def to_string(self) -> str: - """Return complete tree as string""" - if not self._lines: - return "\n".join(list(self.print_line())) - return "\n".join(self._lines) - - def print_line(self): - """Generator for tree string output lines. - - When yielding, also stores the output in self._lines to avoid having - to recompute it. - - Returns - ------- - Generator[str] - """ - if not self._lines: - # string output has not been generated yet - for _, line in self.generate_nodes_with_str(): - self._lines.append(line) - yield line - else: - # string output is already generated - for line in self._lines: - yield line - yield "\n" # newline at the very end - - def generate_nodes_with_str(self): - """Generator of tree nodes and their string representation. - - Each node is printed on one line. The string uses the format: - ``[] [] `` - - Example line: - ``│ │ ├── path_dir_level3`` - - Returns - ------- - Generator[Tuple[_TreeNode, str]] - """ - - # keep track of levels where subtree is exhausted, i.e. we have - # reached the last child of the current subtree. - # this is needed to build the indentation string for each node, - # which takes into account whether any parent is the last node of - # its own subtree. - levels_with_exhausted_subtree = set([]) - - for node in self.generate_nodes(): - - if node.is_last_child: # last child of its subtree - levels_with_exhausted_subtree.add(node.depth) - else: - # 'discard' does not raise exception if value does not exist - # in set - levels_with_exhausted_subtree.discard(node.depth) - - # build indentation string - indentation = "" - spacing = node.INDENTATION_SPACING - if node.depth > 0: - indentation_symbols_for_levels = [ - (node.INDENTATION_SYMBOL - if level not in levels_with_exhausted_subtree - else " ") + spacing - for level in range(1, node.depth) - ] - indentation = "".join(indentation_symbols_for_levels) - - line = indentation + str(node) - yield node, line - class DatasetTree(Tree): """ @@ -626,7 +625,7 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): # lazily, so for now we set the value to a dummy `_TreeNode` # with an impossible depth just to distinguish it from None (None means # the generator has finished). 
-        self._next_ds = _TreeNode(self.root, -1)
 
     @increment_node_count
     def generate_nodes(self):
@@ -677,6 +676,8 @@ def exclude_func(path: Path):
             exclude_node_func=exclude_func,
             skip_root=self.skip_root,
         )
+        # synchronize exhausted levels with the main tree
+        self.exhausted_levels = tree.exhausted_levels
 
         yield from tree.generate_nodes()
 
@@ -727,11 +728,7 @@ def exclude(p: Path):
             if parent not in visited_parents:
                 visited_parents.add(parent)
 
-                yield DirectoryOrDatasetNode(
-                    parent,
-                    depth,
-                    None  # we don't care if it's the last child or not
-                )
+                yield DirectoryOrDatasetNode(parent, depth)
 
             visited_parents.add(node.path)
             yield node
@@ -744,7 +741,7 @@ def _is_valid_dataset(self, path: Path):
             not self._ds_exceeds_max_ds_depth(path)
 
     def _ds_exceeds_max_ds_depth(self, path: Path):
-        ds = DatasetNode(path, self.path_depth(path), False)
+        ds = DatasetNode(path, self.path_depth(path))
         return ds.ds_depth > self.max_dataset_depth
 
     def _ds_child_node_exceeds_max_depth(self, path: Path):
@@ -779,19 +776,8 @@ class _TreeNode:
     """Base class for a directory or file represented as a single tree node
    and printed as single line of the 'tree' output."""
     TYPE = None  # needed for command result dict
-    COLOR = None  # ANSI color for the path, if terminal color are enabled
 
-    # symbols for the tip of the 'tree branch', depending on
-    # whether a node is the last in it subtree or not
-    PREFIX_MIDDLE_CHILD = "├──"
-    PREFIX_LAST_CHILD = "└──"
-
-    # symbol for representing the continuation of a 'tree branch'
-    INDENTATION_SYMBOL = "│"
-    # spacing between the indentation symbol of one level and the next
-    INDENTATION_SPACING = " "
-
-    def __init__(self, path: Path, depth: int, is_last_child: bool):
+    def __init__(self, path: Path, depth: int):
         """
         Parameters
         ----------
@@ -799,12 +785,9 @@ def __init__(self, path: Path, depth: int, is_last_child: bool):
             Path of the tree node
         depth: int
             Directory depth of the node within its tree
-        is_last_child: bool
-            Whether the node is the last node among its parent's children
         """
         self.path = path
         self.depth = depth
-        self.is_last_child = is_last_child
 
     def __eq__(self, other):
         return self.path == other.path
@@ -812,29 +795,6 @@ def __eq__(self, other):
     def __hash__(self):
         return hash(str(self.path))
 
-    def __str__(self):
-        # display root directory with full path, all other nodes with basename
-        if self.depth == 0:
-            path = self.path
-        else:
-            path = self.path.name
-
-        if self.COLOR is not None:
-            path = ansi_colors.color_word(path, self.COLOR)
-
-        if self.depth > 0:
-            prefix = self.PREFIX_LAST_CHILD if self.is_last_child \
-                else self.PREFIX_MIDDLE_CHILD
-            return " ".join([prefix, path])
-        return str(path)  # root directory has no prefix
-
-    @staticmethod
-    def stats_description(count):
-        """String describing the node count that will be included in the
-        tree's report line"""
-        # should be implemented by subclasses
-        raise NotImplementedError
-
     @property
     def tree_root(self) -> Path:
         """Calculate tree root path from node path and depth"""
@@ -861,21 +821,10 @@ def parents(self):
 
 class DirectoryNode(_TreeNode):
     TYPE = "directory"
-    COLOR = ansi_colors.BLUE
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-    def __str__(self):
-        string = super().__str__()
-        if self.depth > 0:
-            return string + "/"
-        return string
-
-    @staticmethod
-    def stats_description(count):
-        return str(count) + (" directory" if int(count) == 1 else " directories")
-
 
 class FileNode(_TreeNode):
     TYPE = "file"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-    @staticmethod
-    def stats_description(count):
-        return str(count) + (" file" if int(count) == 1 else " files")
-
 
 class DatasetNode(_TreeNode):
     TYPE = "dataset"
-    COLOR = ansi_colors.MAGENTA
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -899,27 +843,6 @@ def __init__(self, *args, **kwargs):
         self.is_installed = self.ds.is_installed()
         self.ds_depth, self.ds_absolute_depth = self.calculate_dataset_depth()
 
-    def __str__(self):
-        default_str = super().__str__()
-
-        ds_marker_depth = ansi_colors.color_word(
-            f"DS~{self.ds_absolute_depth}", ansi_colors.WHITE)
-        install_flag = " (not installed)" if not self.is_installed else ""
-        ds_marker = f"[{ds_marker_depth}]{install_flag}"
-
-        if self.depth > 0:
-            prefix = self.PREFIX_LAST_CHILD if self.is_last_child else \
-                self.PREFIX_MIDDLE_CHILD
-            custom_str = default_str.replace(prefix, f"{prefix} {ds_marker}")
-        else:
-            custom_str = f"{ds_marker} {default_str}"
-
-        return custom_str + ("/" if self.depth > 0 else "")
-
-    @staticmethod
-    def stats_description(count):
-        return str(count) + (" dataset" if int(count) == 1 else " datasets")
-
     @lru_cache
     def calculate_dataset_depth(self):
         """

From 0c232f031392510166cb008bfa616011b89d55dd Mon Sep 17 00:00:00 2001
From: Caterina Trainito
Date: Thu, 4 Aug 2022 20:06:32 +0200
Subject: [PATCH 061/131] explicitly specify state 'any' for subdatasets

---
 datalad_next/tree.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datalad_next/tree.py b/datalad_next/tree.py
index eb7985dc..0307be82 100644
--- a/datalad_next/tree.py
+++ b/datalad_next/tree.py
@@ -381,6 +381,7 @@ def res_filter(res):
     return Subdatasets.__call__(
         dataset=ds_path,
         recursive=False,
+        state='any',  # include not-installed subdatasets
         result_filter=res_filter,
         on_failure='ignore',
         result_xfm='paths',

From 21b912588daa9d8b8cfff6424493656e802445a6 Mon Sep 17 00:00:00 2001
From: Caterina Trainito
Date: Thu, 4 Aug 2022 20:06:47 +0200
Subject: [PATCH 062/131] minor rewordings of comments

---
 datalad_next/tree.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/datalad_next/tree.py b/datalad_next/tree.py
index 0307be82..889aab54 100644
--- a/datalad_next/tree.py
+++ b/datalad_next/tree.py
@@ -366,13 +366,12 @@ def is_empty_dir():
 
 @lru_cache
 def get_subds_paths(ds_path: Path):
-    """Return paths of immediate subdatasets for a given dataset path.
+    """Return paths of immediate subdatasets for a given dataset path."""
+    # This is an expensive operation because it calls git to read the
+    # submodules. Since we need to run it to (A) calculate dataset depth and
+    # (B) detect non-installed datasets, we cache results, so that the list of
+    # subdatasets is computed only once for each parent dataset.
 
-    This is an expensive operation because it calls git to read the
-    submodules. Since we need to run this to (A) calculate dataset depth and
-    (B) detect non-installed datasets, we cache results, so that the list of
-    subdatasets is computed only once for each parent dataset.
- """ def res_filter(res): return res.get('status') == 'ok' and res.get('type') == 'dataset' @@ -470,7 +469,7 @@ def get_superdataset(path: Path): def is_path_relative_to(my_path: Path, other_path: Path): - """Port of pathlib's ``Path.is_relative_to()`` that requires python3.9+""" + """Port of pathlib's ``Path.is_relative_to()`` (requires python3.9+)""" try: my_path.relative_to(other_path) return True @@ -533,6 +532,7 @@ def default_exclude_func(path: Path): def path_depth(self, path: Path) -> int: """Calculate directory depth of a given path relative to the root of the tree""" + # TODO: error handling return len(path.relative_to(self.root).parts) def _generate_tree_nodes(self, dir_path: Path): @@ -646,7 +646,7 @@ def generate_nodes(self): def exclude_func(path: Path): """Exclusion function -- here is the crux of the logic for - pruning the dataset tree.""" + pruning the main tree.""" # initialize dataset(-parent) generator if not done yet if self._next_ds is not None and \ From 045a8131c824c6e70293c8db1d8a72967669bfe7 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 4 Aug 2022 20:12:41 +0200 Subject: [PATCH 063/131] only show file count in stats line if --include-files option is given --- datalad_next/tests/test_tree.py | 20 ++++++++++---------- datalad_next/tree.py | 13 +++++++++---- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index ba0ac831..da5c797d 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -289,7 +289,7 @@ class TestTreeWithoutDatasets(TestTree): "depth": 1, "include_files": False, "include_hidden": False, - "expected_stats_str": "0 datasets, 3 directories, 0 files", + "expected_stats_str": "0 datasets, 3 directories", "expected_str": """ ├── dir0/ ├── dir1/ @@ -300,7 +300,7 @@ class TestTreeWithoutDatasets(TestTree): "depth": 3, "include_files": False, "include_hidden": False, - "expected_stats_str": "0 datasets, 6 directories, 0 files", + "expected_stats_str": "0 datasets, 6 directories", "expected_str": """ ├── dir0/ ├── dir1/ @@ -390,7 +390,7 @@ class TestTreeWithoutDatasets(TestTree): "depth": 1, "include_files": False, "include_hidden": True, - "expected_stats_str": "0 datasets, 4 directories, 0 files", + "expected_stats_str": "0 datasets, 4 directories", "expected_str": """ ├── .dir3/ ├── dir0/ @@ -402,7 +402,7 @@ class TestTreeWithoutDatasets(TestTree): "depth": 3, "include_files": False, "include_hidden": True, - "expected_stats_str": "0 datasets, 7 directories, 0 files", + "expected_stats_str": "0 datasets, 7 directories", "expected_str": """ ├── .dir3/ ├── dir0/ @@ -485,7 +485,7 @@ class TestTreeWithDatasets(TestTreeWithoutDatasets): "depth": 1, "include_files": False, "include_hidden": False, - "expected_stats_str": "2 datasets, 1 directory, 0 files", + "expected_stats_str": "2 datasets, 1 directory", "expected_str": """ ├── repo0/ ├── [DS~0] superds0/ @@ -496,7 +496,7 @@ class TestTreeWithDatasets(TestTreeWithoutDatasets): "depth": 4, "include_files": False, "include_hidden": False, - "expected_stats_str": "7 datasets, 3 directories, 0 files", + "expected_stats_str": "7 datasets, 3 directories", "expected_str": """ ├── repo0/ ├── [DS~0] superds0/ @@ -523,7 +523,7 @@ class TestDatasetTree(TestTree): { "dataset_depth": 0, "depth": 0, - "expected_stats_str": "3 datasets, 0 directories, 0 files", + "expected_stats_str": "3 datasets, 0 directories", "expected_str": """ ├── [DS~0] superds0/ └── [DS~0] superds1/ @@ -533,7 +533,7 @@ class 
TestDatasetTree(TestTree): { "dataset_depth": 0, "depth": 1, - "expected_stats_str": "3 datasets, 1 directory, 0 files", + "expected_stats_str": "3 datasets, 1 directory", "expected_str": """ ├── [DS~0] superds0/ └── [DS~0] superds1/ @@ -544,7 +544,7 @@ class TestDatasetTree(TestTree): { "dataset_depth": 1, "depth": 0, - "expected_stats_str": "6 datasets, 1 directory, 0 files", + "expected_stats_str": "6 datasets, 1 directory", "expected_str": """ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ @@ -558,7 +558,7 @@ class TestDatasetTree(TestTree): { "dataset_depth": 1, "depth": 2, - "expected_stats_str": "6 datasets, 2 directories, 0 files", + "expected_stats_str": "6 datasets, 2 directories", "expected_str": """ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 889aab54..e21d6696 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -163,7 +163,8 @@ def __call__( count={ "datasets": tree.node_count["DatasetNode"], "directories": tree.node_count["DirectoryNode"], - "files": tree.node_count["FileNode"] + **({"files": tree.node_count["FileNode"]} + if include_files else {}) }, **{ "dataset_depth": node.ds_depth, @@ -248,13 +249,17 @@ def custom_result_summary_renderer(res, **kwargs): c_ds = res[-1]['count']['datasets'] c_dirs = res[-1]['count']['directories'] - c_files = res[-1]['count']['files'] + # files may not be included in results (if not using command + # option '--include-files') + c_files = res[-1]['count'].get('files') descriptions = [ f"{c_ds} " + ("dataset" if int(c_ds) == 1 else "datasets"), - f"{c_dirs} " + ("directory" if int(c_dirs) == 1 else "directories"), - f"{c_files} " + ("file" if int(c_files) == 1 else "files") + f"{c_dirs} " + ("directory" if int(c_dirs) == 1 else "directories") ] + if c_files is not None: + descriptions.append( + f"{c_files} " + ("file" if int(c_files) == 1 else "files")) print("\n" + ", ".join(descriptions)) From 793ecaa46f1a62d0939c48a8cedc5b243f5c464f Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 4 Aug 2022 20:15:11 +0200 Subject: [PATCH 064/131] use ui.message() instead of print() in result renderers --- datalad_next/tree.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index e21d6696..d68f253d 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -34,6 +34,7 @@ EnsureStr, EnsureInt, EnsureRange, ) from datalad.utils import get_dataset_root +from datalad.ui import ui lgr = logging.getLogger('datalad.local.tree') @@ -241,7 +242,7 @@ def custom_result_renderer(res, **kwargs): line = indentation + \ " ".join((s for s in (prefix, ds_marker, path) if s != "")) + \ dir_suffix - print(line) + ui.message(line) @staticmethod def custom_result_summary_renderer(res, **kwargs): @@ -261,7 +262,7 @@ def custom_result_summary_renderer(res, **kwargs): descriptions.append( f"{c_files} " + ("file" if int(c_files) == 1 else "files")) - print("\n" + ", ".join(descriptions)) + ui.message("\n" + ", ".join(descriptions)) def build_excluded_node_func(include_hidden=False, include_files=False): From bab4830ac170f9240b397ddc7a2732f13a22fa19 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 4 Aug 2022 21:31:56 +0200 Subject: [PATCH 065/131] remove unneeded Tree attribute 'skip_root' --- datalad_next/tree.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index d68f253d..903fa164 100644 --- a/datalad_next/tree.py +++ 
b/datalad_next/tree.py @@ -489,7 +489,6 @@ class Tree: def __init__(self, root: Path, max_depth=None, - skip_root=False, exclude_node_func=None): """ Parameters @@ -498,8 +497,6 @@ def __init__(self, Directory to be used as tree root max_depth: int or None Maximum directory depth for traversing the tree - skip_root: bool - If true, will not print the first line with tree root exclude_node_func: Callable or None Function to filter out tree nodes from the tree """ @@ -511,8 +508,6 @@ def __init__(self, if max_depth is not None and max_depth < 0: raise ValueError("max_depth must be >= 0") - self.skip_root = skip_root - # set custom or default filter criteria self.exclude_node_func = exclude_node_func or self.default_exclude_func @@ -550,9 +545,8 @@ def _generate_tree_nodes(self, dir_path: Path): dir_path: Path Directory from which to calculate the tree """ - if not self.skip_root or \ - self.skip_root and self.path_depth(dir_path) > 0: - yield DirectoryOrDatasetNode(dir_path, self.path_depth(dir_path)) + # yield current node + yield DirectoryOrDatasetNode(dir_path, self.path_depth(dir_path)) # check that we are within max_depth levels # (None means unlimited depth) @@ -681,7 +675,6 @@ def exclude_func(path: Path): self.root, max_depth=None, # unlimited traversal (datasets could be anywhere) exclude_node_func=exclude_func, - skip_root=self.skip_root, ) # synchronize exhausted levels with the main tree self.exhausted_levels = tree.exhausted_levels @@ -718,7 +711,6 @@ def exclude(p: Path): self.root, max_depth=None, exclude_node_func=exclude, - skip_root=True, ) visited_parents = set([]) @@ -730,7 +722,7 @@ def exclude(p: Path): # yield parent directories if not already done for depth, parent in enumerate(node.parents): - if depth == 0 and ds_tree.skip_root: + if depth == 0: continue if parent not in visited_parents: visited_parents.add(parent) From 5f2f2d53baeacaf5092d20ad1797c369eb1dd4ff Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 4 Aug 2022 23:21:39 +0200 Subject: [PATCH 066/131] start ds_generator from node below the root node --- datalad_next/tree.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 903fa164..18b39526 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -715,7 +715,12 @@ def exclude(p: Path): visited_parents = set([]) - for node in ds_tree.generate_nodes(): + nodes_below_root = ds_tree.generate_nodes() + next(nodes_below_root) # skip root node + + for node in nodes_below_root: + # for each dataset node, yield its parents first, then + # yield the dataset itself if isinstance(node, DatasetNode) and \ node.ds_depth <= self.max_dataset_depth and \ not self.exclude_node_func(node.path): From f66e30f62dc0d501c038257c5e01bacf19dbd023 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 4 Aug 2022 23:22:54 +0200 Subject: [PATCH 067/131] rename 'visited_parents' to 'visited' since we store all yielded nodes --- datalad_next/tree.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 18b39526..d98e4b5a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -687,7 +687,7 @@ def _advance_ds_generator(self): def _generate_datasets(self): """Generator of dataset nodes and their parent directories starting - from the tree root and up to ``max_dataset_depth`` levels. + from below the tree root and up to ``max_dataset_depth`` levels. 
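The 'from below the tree root' behaviour amounts to generator priming: pull one item off the generator before handing it to the loop. A toy illustration with an assumed recursive walk() helper (not the code in this module):

    from pathlib import Path

    def walk(root: Path, depth: int = 0):
        yield root, depth
        for child in sorted(p for p in root.iterdir() if p.is_dir()):
            yield from walk(child, depth + 1)

    def walk_below(root: Path):
        nodes = walk(root)
        next(nodes)  # discard the root node itself
        yield from nodes

A single next() call keeps the generator lazy and is clearer than filtering on depth inside the consumer loop.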
This secondary 'helper' tree will be generated in parallel with the main tree but will be one step ahead, such that it always points to @@ -713,7 +713,8 @@ def exclude(p: Path): exclude_node_func=exclude, ) - visited_parents = set([]) + # keep track of node paths that have already been yielded + visited = set([]) nodes_below_root = ds_tree.generate_nodes() next(nodes_below_root) # skip root node @@ -726,15 +727,14 @@ def exclude(p: Path): not self.exclude_node_func(node.path): # yield parent directories if not already done - for depth, parent in enumerate(node.parents): - if depth == 0: - continue - if parent not in visited_parents: - visited_parents.add(parent) + parents_below_root = node.parents[1:] # first parent is root + for depth, parent in enumerate(parents_below_root): + if parent not in visited: + visited.add(parent) yield DirectoryOrDatasetNode(parent, depth) - visited_parents.add(node.path) + visited.add(node.path) yield node def _is_valid_dataset(self, path: Path): From 6004d51320d7d406d66f90e76db784d3d25e31af Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 4 Aug 2022 23:39:52 +0200 Subject: [PATCH 068/131] cast exhausted_subtrees set to list in results dict for easier conversion in JSON renderers --- datalad_next/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index d98e4b5a..a30dc1ea 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -160,7 +160,7 @@ def __call__( path=str(node.path), type=node.TYPE, depth=node.depth, - exhausted_levels=tree.exhausted_levels, + exhausted_levels=list(tree.exhausted_levels), count={ "datasets": tree.node_count["DatasetNode"], "directories": tree.node_count["DirectoryNode"], From 9d074fd0dcda12096e767d1f8f25e3d80bcdf142 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Thu, 4 Aug 2022 23:40:54 +0200 Subject: [PATCH 069/131] remove redundant constructors for FileNode and DirectoryNode, which just inherit from super --- datalad_next/tree.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index a30dc1ea..0ffebfbb 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -826,16 +826,10 @@ def parents(self): class DirectoryNode(_TreeNode): TYPE = "directory" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - class FileNode(_TreeNode): TYPE = "file" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - class DatasetNode(_TreeNode): TYPE = "dataset" From f7ee92550e9ded7288ed5419554d6024d58a42f9 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 5 Aug 2022 00:49:46 +0200 Subject: [PATCH 070/131] replace print with ui.message in tests as well (fixes encoding error on windows) --- datalad_next/tests/test_tree.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index da5c797d..14702e6f 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -11,6 +11,7 @@ with_tree ) from datalad.utils import rmtemp +from datalad.ui import ui from ..tree import Tree @@ -428,10 +429,10 @@ def test_print_tree( ] _, actual_res, _ = get_tree_rendered_output(command) expected_res = expected_str.lstrip("\n") # strip first newline - print("expected:") - print(expected_res) - print("actual:") - print(actual_res) + ui.message("expected:") + ui.message(expected_res) + ui.message("actual:") + ui.message(actual_res) 
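The renamed `visited` set in the dataset generator implements a standard 'emit every ancestor exactly once' traversal. Reduced to its essentials, as a sketch assuming Python 3.9's `Path.is_relative_to()` (the module's own `is_path_relative_to()` port would work the same way):

    from pathlib import Path

    def emit_with_parents(datasets, root: Path):
        """Yield each dataset preceded by its not-yet-seen ancestors."""
        visited = set()
        for ds in datasets:
            # Path.parents yields nearest-first, so reverse it to get
            # the outermost ancestor first
            for parent in reversed(ds.parents):
                if parent == root or not parent.is_relative_to(root):
                    continue
                if parent not in visited:
                    visited.add(parent)
                    yield 'directory', parent
            if ds not in visited:
                visited.add(ds)
                yield 'dataset', ds

Adding the datasets themselves to the set is the point of the rename: a dataset that is also the ancestor of a deeper dataset must not be yielded a second time as a plain directory.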
assert_str_equal(expected_res, actual_res) def test_print_stats( @@ -593,10 +594,10 @@ def test_print_tree( ] _, actual_res, _ = get_tree_rendered_output(command) expected_res = expected_str.lstrip("\n") # strip first newline - print("expected:") - print(expected_res) - print("actual:") - print(actual_res) + ui.message("expected:") + ui.message(expected_res) + ui.message("actual:") + ui.message(actual_res) assert_str_equal(expected_res, actual_res) def test_print_stats( From d3c8243bed5991eb8c874ce81205fcc7b0bd1248 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 5 Aug 2022 10:35:26 +0200 Subject: [PATCH 071/131] get dataset's pathobj property instead of re-instantiating Path object --- datalad_next/tree.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 0ffebfbb..795234e4 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -353,7 +353,7 @@ def is_dataset(path: Path): # (as opposed to git/git-annex repo). # could also query `ds.id`, but checking just for existence # of config file is quicker. - if Path(path / ".datalad" / "config").is_file(): + if (path / ".datalad" / "config").is_file(): return True # if it is not installed, check if it has an installed superdataset. @@ -460,7 +460,7 @@ def get_superdataset(path: Path): superds = Dataset(sds_path_) # test if path is registered subdataset of the parent - if not is_subds_of_parent(Path(path), Path(superds.path)): + if not is_subds_of_parent(Path(path), superds.pathobj): break # That was a good candidate @@ -612,6 +612,7 @@ class DatasetTree(Tree): """ def __init__(self, *args, max_dataset_depth=0, **kwargs): super().__init__(*args, **kwargs) + # by default, do not recurse into datasets' subdirectories (other # than paths to nested subdatasets) if self.max_depth is None: @@ -622,6 +623,7 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): # secondary 'helper' generator that will traverse the whole tree # (once) and yield only datasets and their parents directories self._ds_generator = self._generate_datasets() + # current value of the ds_generator. 
the generator will be initialized # lazily, so for now we set the value to a dummy `_TreeNode` # with an impossible depth just to distinguish it from None (None means @@ -873,7 +875,7 @@ def calculate_dataset_depth(self): break ds_absolute_depth += 1 - if is_path_relative_to(Path(superds.path), self.tree_root): + if is_path_relative_to(superds.pathobj, self.tree_root): # if the parent dataset is underneath the tree # root, we increment the relative depth ds_depth += 1 From d50653ac3c4f5bfe72433d9514b643b971ddd945 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 5 Aug 2022 10:46:21 +0200 Subject: [PATCH 072/131] use context manager 'make_tempfile' for deleting temp dir --- datalad_next/tests/test_tree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 14702e6f..2cb40d7a 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -10,7 +10,7 @@ assert_str_equal, with_tree ) -from datalad.utils import rmtemp +from datalad.utils import rmtemp, make_tempfile from datalad.ui import ui from ..tree import Tree @@ -255,10 +255,10 @@ def pytest_generate_tests(metafunc): def test_print_tree_fails_for_nonexistent_directory(): """Obtain nonexistent directory by creating a temp dir and deleting it (may be safest method)""" - dir_name = f"to_be_deleted_{datetime.now().timestamp()}" - nonexistent_dir = Path(with_tree({dir_name: []})(lambda f: f)()) + with make_tempfile(mkdir=True) as nonexistent_dir: + pass # do nothing, just wait for it to be deleted with assert_raises(ValueError): - Tree(nonexistent_dir, max_depth=1) + Tree(Path(nonexistent_dir), max_depth=1) class TestTree: From 54f4f87cae81f074a50308187633ba238f76cf7a Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 6 Aug 2022 22:12:42 +0200 Subject: [PATCH 073/131] clean up imports, improve docstring wording --- datalad_next/tree.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 795234e4..88cf5e77 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -18,6 +18,10 @@ Interface, build_doc, ) +from datalad.support.exceptions import ( + CapturedException, + NoDatasetFound +) from datalad.support.param import Parameter from datalad.distribution.dataset import ( datasetmethod, @@ -31,7 +35,9 @@ from datalad.local.subdatasets import Subdatasets from datalad.support.constraints import ( EnsureNone, - EnsureStr, EnsureInt, EnsureRange, + EnsureStr, + EnsureInt, + EnsureRange, ) from datalad.utils import get_dataset_root from datalad.ui import ui @@ -266,8 +272,8 @@ def custom_result_summary_renderer(res, **kwargs): def build_excluded_node_func(include_hidden=False, include_files=False): - """Return a function to exclude ``_TreeNode`` objects from the tree ( - prevents them from being yielded by the node generator). + """Return a function to exclude ``_TreeNode`` objects from the tree + (prevents them from being yielded by the node generator). 
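Structurally this is a predicate factory: it closes over the two flags and hands the tree generator a single callable to apply to every candidate path. A simplified sketch of that shape (illustrative only; the shipped implementation differs in its filter details):

    from pathlib import Path

    def build_excluded_node_func(include_hidden=False, include_files=False):
        def is_excluded(path: Path) -> bool:
            # True means: do not yield a node for this path
            if not include_files and not path.is_dir():
                return True
            if not include_hidden and path.name.startswith('.'):
                return True
            return False
        return is_excluded

    exclude = build_excluded_node_func(include_files=True)
    visible = [p for p in Path('.').iterdir() if not exclude(p)]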
Returns ------- @@ -502,13 +508,14 @@ def __init__(self, """ self.root = root.resolve() if not self.root.is_dir(): - raise ValueError(f"directory '{root}' not found") + raise ValueError(f"Directory not found: '{root}'") self.max_depth = max_depth if max_depth is not None and max_depth < 0: raise ValueError("max_depth must be >= 0") - # set custom or default filter criteria + # set callable to exclude nodes from the tree, meaning they + # will not be yielded by the node generator self.exclude_node_func = exclude_node_func or self.default_exclude_func # keep track of levels where the subtree is exhausted, @@ -527,7 +534,7 @@ def __init__(self, @staticmethod def default_exclude_func(path: Path): - """By default, only include non-hidden directories, no files""" + """By default, exclude files and hidden directories from the tree""" return any((not path.is_dir(), path.name.startswith("."))) def path_depth(self, path: Path) -> int: @@ -760,7 +767,8 @@ def _ds_child_node_exceeds_max_depth(self, path: Path): # check directory depth relative to the dataset parent rel_depth = self.path_depth(path) - self.path_depth(ds_parent) - assert rel_depth >= 0 # sanity check + assert rel_depth >= 0, "relative depth from parent cannot be < 0 " \ + f"(path: '{path}', parent: '{ds_parent}')" return rel_depth > self.max_depth def _is_parent_of_ds(self, path: Path): From b9caf11d8ef651d6c6726a6d6a6c8de3b2f6e2d2 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 6 Aug 2022 22:16:49 +0200 Subject: [PATCH 074/131] first error handling impl with checks for OSErrors and circular symlinks --- datalad_next/tests/test_tree.py | 76 ++++++++- datalad_next/tree.py | 265 ++++++++++++++++++++++++-------- 2 files changed, 270 insertions(+), 71 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 2cb40d7a..d64b06ae 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,4 +1,4 @@ -from datetime import datetime +from contextlib import contextmanager from pathlib import Path import pytest @@ -8,9 +8,13 @@ from datalad.tests.utils_pytest import ( assert_raises, assert_str_equal, - with_tree + with_tree, ok_exists, with_tempfile, assert_in, + get_deeply_nested_structure, skip_wo_symlink_capability +) +from datalad.utils import ( + rmtemp, + make_tempfile, ) -from datalad.utils import rmtemp, make_tempfile from datalad.ui import ui from ..tree import Tree @@ -20,6 +24,24 @@ # ============================ Helper functions =============================== +@contextmanager +def ensure_no_permissions(path: Path): + """Remove all permissions for given file/directory and restore the + original permissions at the end""" + + # modeled after 'datalad.utils.ensure_write_permission' + original_mode = path.stat().st_mode + try: + path.chmod(0o000) + yield + finally: + try: + path.chmod(original_mode) + except FileNotFoundError: + # ignore error if path was deleted in the context block + pass + + def create_temp_dir_tree(tree_dict: dict) -> Path: """ Create a temporary directory tree. 
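To illustrate how `ensure_no_permissions()` composes in a test within this module, here is a hypothetical sketch using pytest's built-in `tmp_path` fixture and the node `exception` attribute introduced by this patch. Note that a mode-000 directory remains readable for root, so such a test is typically skipped when the suite runs as the superuser:

    def test_unreadable_dir_is_reported(tmp_path):
        locked = tmp_path / 'locked'
        locked.mkdir()
        with ensure_no_permissions(locked):
            # generation must not raise; the error travels on the node
            nodes = list(Tree(tmp_path, max_depth=2).generate_nodes())
        assert any(n.exception is not None for n in nodes)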
@@ -152,7 +174,7 @@ def path_ds(): assert not temp_dir_root.exists() -def get_tree_rendered_output(tree_cmd: list): +def get_tree_rendered_output(tree_cmd: list, exit_code: int = 0): """ Run 'tree' CLI command with the given list of arguments and return the output of the custom results renderer, broken down into @@ -164,14 +186,16 @@ def get_tree_rendered_output(tree_cmd: list): ---------- tree_cmd: list(str) 'tree' command given as list of strings + exit_code: int + Expected exit code of command (default: 0) Returns ------- - tuple + Tuple[str, str, str] 3-value tuple consisting of: tree root, tree body, report line """ # remove any empty strings from command - out, _ = run_main([c for c in tree_cmd if c != '']) + out, _ = run_main([c for c in tree_cmd if c != ''], exit_code=exit_code) # remove trailing newline lines = out.rstrip("\n").split("\n") @@ -261,6 +285,35 @@ def test_print_tree_fails_for_nonexistent_directory(): Tree(Path(nonexistent_dir), max_depth=1) +@skip_wo_symlink_capability +@with_tempfile +def test_tree_with_circular_symlinks(path=None): + """Test that we do not follow symlinks that point to directories + underneath the tree root (or its parent), to avoid duplicate subtrees""" + ds = get_deeply_nested_structure(path) + root = ds.path + command = ["tree", "--depth", "2", root] + _, actual_res, _ = get_tree_rendered_output(command) + expected_res = """ +├── directory_untracked/ +│ └── link2dir/ +├── link2dir/ +├── link2subdsdir/ +├── link2subdsroot/ +├── subdir/ +└── [DS~1] subds_modified/ + ├── link2superdsdir/ + ├── subdir/ + └── [DS~2] subds_lvl1_modified/ +""".lstrip("\n") + + ui.message("expected:") + ui.message(expected_res) + ui.message("actual:") + ui.message(actual_res) + assert_str_equal(expected_res, actual_res) + + class TestTree: """Base class with tests that should run for all Tree configurations""" __test__ = False # tells pytest to not collect tests in this class @@ -450,6 +503,17 @@ def test_print_stats( expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) + def test_print_tree_permission_denied(self): + parent_dir = str(self.path) + with make_tempfile(mkdir=True, dir=parent_dir) as forbidden_dir: + # temporarily remove all permissions + with ensure_no_permissions(Path(forbidden_dir)): + # tree command should return error exit status but not crash + command = ['tree', parent_dir, '--depth', '2', '--include-files'] + _, actual, _ = get_tree_rendered_output(command, exit_code=1) + ui.message(actual) + assert_in("[error opening dir]", actual) + @pytest.mark.parametrize( "root_dir_name", ["root/", "root/.", "root/./", "root/../root"] ) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 88cf5e77..61d3b3d5 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -160,25 +160,44 @@ def __call__( for node in tree.generate_nodes(): # yield one node at a time to improve UX / perceived speed - yield get_status_dict( - action="tree", - status="ok", - path=str(node.path), - type=node.TYPE, - depth=node.depth, - exhausted_levels=list(tree.exhausted_levels), - count={ + res_dict = { + "action": "tree", + "path": str(node.path), + "type": node.TYPE, + "depth": node.depth, + "exhausted_levels": list(tree.exhausted_levels), + "count": { "datasets": tree.node_count["DatasetNode"], "directories": tree.node_count["DirectoryNode"], **({"files": tree.node_count["FileNode"]} if include_files else {}) }, - **{ + } + if node.TYPE == "dataset": + res_dict.update({ "dataset_depth": node.ds_depth, "dataset_abs_depth": node.ds_absolute_depth, 
"dataset_is_installed": node.is_installed - } if node.TYPE == "dataset" else {}, - ) + }) + + if node.exception is not None: + # mimic error message of unix 'tree' command for + # permission denied error + message = "error opening dir" \ + if node.exception.name == "PermissionError" \ + else node.exception.message + + yield get_status_dict( + status="error", + message=message, + exception=node.exception, + **res_dict + ) + else: + yield get_status_dict( + status="ok", + **res_dict + ) @staticmethod def custom_result_renderer(res, **kwargs): @@ -245,9 +264,14 @@ def custom_result_renderer(res, **kwargs): if depth > 0 and node_type in ("directory", "dataset"): dir_suffix = "/" + # add short error message if there was exception + error_msg = "" + if "exception" in res: + error_msg = f" [{res['message']}]" + line = indentation + \ " ".join((s for s in (prefix, ds_marker, path) if s != "")) + \ - dir_suffix + dir_suffix + error_msg ui.message(line) @staticmethod @@ -334,9 +358,14 @@ def yield_with_last_item(generator): yield True, prev_val +def is_empty_dir(path: Path): + return path.is_dir() and not any(path.iterdir()) + + @lru_cache def is_dataset(path: Path): """Fast dataset detection. + Infer that a directory is a dataset if it is either: - installed, or @@ -344,8 +373,12 @@ def is_dataset(path: Path): Only consider datalad datasets, not plain git/git-annex repos. - Results are cached because the check is somewhat expensive and may be run - multiple times on the same path. + Symlinks pointing to datasets are not resolved, so will always return + False for symlinks. This prevents potentially detecting duplicate datasets + if the symlink and its target are both included in the tree. + + Results are cached because the check is somewhat expensive and may + be run multiple times on the same path. TODO: is there a way to detect a datalad dataset if it is not installed and it is not a subdataset? @@ -355,24 +388,29 @@ def is_dataset(path: Path): path: Path Path to directory to be identified as dataset or non-dataset """ - # detect if it is an installed datalad-proper dataset - # (as opposed to git/git-annex repo). - # could also query `ds.id`, but checking just for existence - # of config file is quicker. - if (path / ".datalad" / "config").is_file(): - return True - - # if it is not installed, check if it has an installed superdataset. - # instead of querying ds.is_installed() (which checks if the - # directory has the .git folder), we check if the directory - # is empty (faster) -- as e.g. after a non-recursive `datalad clone` - def is_empty_dir(): - return not any(path.iterdir()) + try: + if path.is_symlink(): + return False # ignore symlinks even if pointing to datasets - if is_empty_dir(): - if get_superdataset(path) is not None: + if (path / ".datalad" / "config").is_file(): + # could also query `ds.id`, but checking just for existence + # of config file is quicker. return True + # if it is not installed, check if it has an installed superdataset. + # instead of querying ds.is_installed() (which checks if the + # directory has the .git folder), we check if the directory + # is empty (faster) -- as e.g. after a non-recursive `datalad clone` + if is_empty_dir(path): + if get_superdataset(path) is not None: + return True + + except Exception as ex: + # if anything fails (e.g. 
permission denied), we raise exception + # instead of returning False + raise NoDatasetFound(f"Cannot determine if '{path.name}' is a " + f"dataset") from ex + return False @@ -537,11 +575,62 @@ def default_exclude_func(path: Path): """By default, exclude files and hidden directories from the tree""" return any((not path.is_dir(), path.name.startswith("."))) - def path_depth(self, path: Path) -> int: + def path_depth(self, path: Path): """Calculate directory depth of a given path relative to the root of - the tree""" - # TODO: error handling - return len(path.relative_to(self.root).parts) + the tree. + + Can also be a negative integer if the path is a parent of the + tree root. + + Parameters + ---------- + path: Path + + Returns + ------- + int + Number of levels of the given path *below* the tree root (positive + integer) or *above* the tree root (negative integer) + """ + if is_path_relative_to(path, self.root): + return len(path.relative_to(self.root).parts) + elif is_path_relative_to(self.root, path): + return - len(self.root.relative_to(path).parts) + else: + return None # dummy value + + def is_circular_symlink(self, dir_path: Path): + """Detect symlink pointing to a directory that could lead to + duplicate subtrees. + + The default behaviour is to follow symlinks. However, do not follow + symlinks to directories that are also located under the tree root + or any parent of the tree root. + + Otherwise, the same subtree could be yielded multiple times, + potentially in an infinite loop (e.g. if the symlink points to + its parent). + """ + if not dir_path.is_symlink(): + return False + + target_dir = dir_path.resolve() + + is_circular = False + if is_path_relative_to(target_dir, self.root): + # target dir is within `max_depth` levels under the current tree, + # so it will likely be yielded or has already been yielded (bar + # any exclusion filters) + is_circular = self.max_depth is None or \ + self.path_depth(target_dir) <= self.max_depth + + elif is_path_relative_to(self.root, target_dir): + # target dir is a parent of the tree root, so we may still get + # into a loop if we recurse more than `max_depth` levels + is_circular = self.max_depth is None or \ + - self.path_depth(target_dir) > self.max_depth + + return is_circular def _generate_tree_nodes(self, dir_path: Path): """Recursively yield ``_TreeNode`` objects starting from @@ -560,10 +649,20 @@ def _generate_tree_nodes(self, dir_path: Path): if self.max_depth is None or \ self.path_depth(dir_path) < self.max_depth: - # sort child nodes alphabetically - # needs to be done *before* calling the exclusion function, - # because the function may depend on sort order - all_children = sorted(list(dir_path.iterdir())) + if self.is_circular_symlink(dir_path): + # if symlink points to directory, do not recurse into it + return + + try: + # sort child nodes alphabetically + # needs to be done *before* calling the exclusion function, + # because the function may depend on sort order + all_children = sorted(list(dir_path.iterdir())) + except OSError as ex: + # do not recurse into children. + # the error should have been already stored in the + # `exception` attribute of the current parent node. 
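The signed-depth convention introduced above (positive for paths below the root, negative for its ancestors) can be pinned down with a tiny standalone version and the values it produces -- a sketch that, unlike the method above, simply lets ValueError propagate for unrelated paths (the method returns a dummy value for those; a later patch turns that case into an error):

    from pathlib import Path

    def path_depth(path: Path, root: Path) -> int:
        # +n for paths n levels below root, -n for ancestors of root
        try:
            return len(path.relative_to(root).parts)
        except ValueError:
            return -len(root.relative_to(path).parts)

    root = Path('/data/tree')
    assert path_depth(root, root) == 0
    assert path_depth(root / 'a' / 'b', root) == 2
    assert path_depth(Path('/data'), root) == -1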
+ return # apply exclusion filters children = ( @@ -657,26 +756,31 @@ def exclude_func(path: Path): """Exclusion function -- here is the crux of the logic for pruning the main tree.""" - # initialize dataset(-parent) generator if not done yet - if self._next_ds is not None and \ - self._next_ds.depth == -1: # dummy depth - self._advance_ds_generator() - - if path.is_dir() and is_dataset(path): - # check if maximum dataset depth is exceeded - is_valid_ds = self._is_valid_dataset(path) - if is_valid_ds: - self._advance_ds_generator() # go to next dataset(-parent) - return not is_valid_ds - - # exclude file or directory underneath a dataset, - # if it has depth (relative to dataset root) > max_depth, - # unless (in case of a directory) it is itself the parent of a - # valid dataset. if it's a parent of a dataset, we don't apply - # any filters -- it's just a means to get to the next dataset. - if not self._is_parent_of_ds(path): - return self.exclude_node_func(path) or \ - self._ds_child_node_exceeds_max_depth(path) + try: + # initialize dataset(-parent) generator if not done yet + if self._next_ds is not None and \ + self._next_ds.depth == -1: # dummy depth + self._advance_ds_generator() + + if path.is_dir() and is_dataset(path): + # check if maximum dataset depth is exceeded + is_valid_ds = self._is_valid_dataset(path) + if is_valid_ds: + self._advance_ds_generator() # go to next dataset(-parent) + return not is_valid_ds + + # exclude file or directory underneath a dataset, + # if it has depth (relative to dataset root) > max_depth, + # unless (in case of a directory) it is itself the parent of a + # valid dataset. if it's a parent of a dataset, we don't apply + # any filters -- it's just a means to get to the next dataset. + if not self._is_parent_of_ds(path): + return self.exclude_node_func(path) or \ + self._ds_child_node_exceeds_max_depth(path) + + except Exception as ex: + CapturedException(ex, level=10) # DEBUG level + return True # exclude by default return False # do not exclude @@ -791,7 +895,8 @@ class _TreeNode: and printed as single line of the 'tree' output.""" TYPE = None # needed for command result dict - def __init__(self, path: Path, depth: int): + def __init__(self, path: Path, depth: int, + exception: CapturedException = None): """ Parameters ---------- @@ -799,9 +904,13 @@ def __init__(self, path: Path, depth: int): Path of the tree node depth: int Directory depth of the node within its tree + exception: CapturedException + Exception that may have occurred at validation/creation """ self.path = path self.depth = depth + # TODO: should be error collection / list of exceptions? + self.exception = exception def __eq__(self, other): return self.path == other.path @@ -836,6 +945,19 @@ def parents(self): class DirectoryNode(_TreeNode): TYPE = "directory" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + try: + # get first child if exists. this is a check for whether + # we can (potentially) recurse into the directory or + # if there are any filesystem issues (permissions errors, etc) + any(self.path.iterdir()) + except OSError as ex: + # permission errors etc. are logged and stored as node + # attribute so they can be passed to results dict + self.exception = CapturedException(ex, level=10) # DEBUG level + class FileNode(_TreeNode): TYPE = "file" @@ -845,11 +967,18 @@ class DatasetNode(_TreeNode): TYPE = "dataset" def __init__(self, *args, **kwargs): + """Does not check if valid dataset. 
This needs to be done before + creating the instance.""" super().__init__(*args, **kwargs) - self.ds = require_dataset(self.path, check_installed=False) - self.is_installed = self.ds.is_installed() - self.ds_depth, self.ds_absolute_depth = self.calculate_dataset_depth() + try: + self.ds = require_dataset(self.path, check_installed=False) + self.is_installed = self.ds.is_installed() + self.ds_depth, self.ds_absolute_depth = self.calculate_dataset_depth() + except Exception as ex: + if self.exception is not None: + # only if exception has not already been passed to constructor + self.exception = CapturedException(ex, level=10) @lru_cache def calculate_dataset_depth(self): @@ -898,7 +1027,13 @@ class DirectoryOrDatasetNode: ``DatasetNode``, based on whether the path is a dataset or not. """ def __new__(cls, path, *args, **kwargs): - if is_dataset(path): - return DatasetNode(path, *args, **kwargs) - else: - return DirectoryNode(path, *args, **kwargs) + try: + is_ds = is_dataset(path) # could fail because of permissions etc. + except Exception as ex: + # if dataset detection has failed, we fall back to a + # `DirectoryNode` with the exception stored as attribute + ce = CapturedException(ex, level=10) + return DirectoryNode(path, *args, exception=ce, **kwargs) + + node_cls = DatasetNode if is_ds else DirectoryNode + return node_cls(path, *args, **kwargs) From e67be06643771007ae1436fd026ad18de7f484bc Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 6 Aug 2022 22:17:38 +0200 Subject: [PATCH 075/131] clean up imports formatting --- datalad_next/tests/test_tree.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index d64b06ae..745c6b12 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -6,10 +6,14 @@ from datalad.cli.tests.test_main import run_main from datalad.tests.test_utils_testrepos import BasicGitTestRepo from datalad.tests.utils_pytest import ( + assert_in, assert_raises, assert_str_equal, - with_tree, ok_exists, with_tempfile, assert_in, - get_deeply_nested_structure, skip_wo_symlink_capability + with_tree, + ok_exists, + with_tempfile, + get_deeply_nested_structure, + skip_wo_symlink_capability ) from datalad.utils import ( rmtemp, From a9e7d32b65936f2f1a25686201de902c76298bac Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 6 Aug 2022 22:18:01 +0200 Subject: [PATCH 076/131] assert that directory exists before deleting it --- datalad_next/tests/test_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 745c6b12..9c817741 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -284,7 +284,7 @@ def test_print_tree_fails_for_nonexistent_directory(): """Obtain nonexistent directory by creating a temp dir and deleting it (may be safest method)""" with make_tempfile(mkdir=True) as nonexistent_dir: - pass # do nothing, just wait for it to be deleted + ok_exists(nonexistent_dir) # just wait for it to be deleted with assert_raises(ValueError): Tree(Path(nonexistent_dir), max_depth=1) From 45b73f9b4a0631b758323f7b53726aba1189ef59 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 09:33:21 +0200 Subject: [PATCH 077/131] add commented option to compare test result with tree command output --- datalad_next/tests/test_tree.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/datalad_next/tests/test_tree.py 
b/datalad_next/tests/test_tree.py index 9c817741..05009415 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -317,6 +317,10 @@ def test_tree_with_circular_symlinks(path=None): ui.message(actual_res) assert_str_equal(expected_res, actual_res) + # Compare with output of 'tree' command + # import subprocess + # subprocess.run(["tree", "-dlL", "2", root]) + class TestTree: """Base class with tests that should run for all Tree configurations""" From dfb385696efa0a6649d304429718b6a9aeec289b Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 09:35:11 +0200 Subject: [PATCH 078/131] raise exception if trying to calculate depth of path outside the tree root --- datalad_next/tree.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 61d3b3d5..2914f38a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -597,7 +597,9 @@ def path_depth(self, path: Path): elif is_path_relative_to(self.root, path): return - len(self.root.relative_to(path).parts) else: - return None # dummy value + raise ValueError("Could not calculate directory depth: " + f"'{path}' is not relative to the tree root " + f"'{self.root}' (or vice-versa)") def is_circular_symlink(self, dir_path: Path): """Detect symlink pointing to a directory that could lead to From b9ca3efa3a0073b53b0da36e6e4223d439266185 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 09:37:19 +0200 Subject: [PATCH 079/131] return true for is_recursive_symlinks() if link points to itself --- datalad_next/tree.py | 75 +++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 2914f38a..2f01f254 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -601,38 +601,46 @@ def path_depth(self, path: Path): f"'{path}' is not relative to the tree root " f"'{self.root}' (or vice-versa)") - def is_circular_symlink(self, dir_path: Path): - """Detect symlink pointing to a directory that could lead to - duplicate subtrees. + def is_recursive_symlink(self, dir_path: Path): + """Detect symlink pointing to a directory within the same tree. - The default behaviour is to follow symlinks. However, do not follow - symlinks to directories that are also located under the tree root - or any parent of the tree root. + The default behaviour is to follow symlinks. However, we do not follow + symlinks to directories that we may visit or have visited already, + i.e. are also located under the tree root or any parent of + the tree root. - Otherwise, the same subtree could be yielded multiple times, - potentially in an infinite loop (e.g. if the symlink points to - its parent). + Otherwise, the same subtree could be generated multiple times in + different places, potentially in a recursive loop (e.g. if the + symlink points to its parent). + + This is similar to the logic of the UNIX 'tree' command, but goes a + step further to prune all duplicate subtrees. 
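Stripped of the max_depth arithmetic, the check boils down to: resolve the link target without requiring it to exist, treat a resolution loop as recursive, and flag any target lying inside the tree or on the ancestor chain of its root. A minimal standalone sketch (the method above additionally weighs the target's depth against max_depth):

    from pathlib import Path

    def is_recursive_symlink(link: Path, root: Path) -> bool:
        if not link.is_symlink():
            return False
        try:
            target = link.resolve(strict=False)
        except RuntimeError:
            # resolution loop, e.g. a symlink pointing to itself
            return True
        return (target == root
                or root in target.parents   # target lies below the root
                or target in root.parents)  # target is an ancestor of root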
""" if not dir_path.is_symlink(): return False - target_dir = dir_path.resolve() - - is_circular = False - if is_path_relative_to(target_dir, self.root): - # target dir is within `max_depth` levels under the current tree, - # so it will likely be yielded or has already been yielded (bar - # any exclusion filters) - is_circular = self.max_depth is None or \ - self.path_depth(target_dir) <= self.max_depth - - elif is_path_relative_to(self.root, target_dir): - # target dir is a parent of the tree root, so we may still get - # into a loop if we recurse more than `max_depth` levels - is_circular = self.max_depth is None or \ - - self.path_depth(target_dir) > self.max_depth - - return is_circular + try: + # do not check if target actually exists, because if it doesn't, + # it will not be detected as directory, so we won't try to + # recurse into it anyway + target_dir = dir_path.resolve(strict=False) + except RuntimeError: + # RuntimeError means symlink points to itself, so it's all the + # more recursive + return True + else: + if is_path_relative_to(target_dir, self.root): + # target dir is within `max_depth` levels under the current + # tree, so it will likely be yielded or has already been + # yielded (bar any exclusion filters) + return self.max_depth is None or \ + self.path_depth(target_dir) <= self.max_depth + + elif is_path_relative_to(self.root, target_dir): + # target dir is a parent of the tree root, so we may still + # get into a loop if we recurse more than `max_depth` levels + return self.max_depth is None or \ + - self.path_depth(target_dir) > self.max_depth def _generate_tree_nodes(self, dir_path: Path): """Recursively yield ``_TreeNode`` objects starting from @@ -651,8 +659,9 @@ def _generate_tree_nodes(self, dir_path: Path): if self.max_depth is None or \ self.path_depth(dir_path) < self.max_depth: - if self.is_circular_symlink(dir_path): - # if symlink points to directory, do not recurse into it + if self.is_recursive_symlink(dir_path): + # if symlink points to directory that we may visit or may + # have visited already, do not recurse into it return try: @@ -660,10 +669,11 @@ def _generate_tree_nodes(self, dir_path: Path): # needs to be done *before* calling the exclusion function, # because the function may depend on sort order all_children = sorted(list(dir_path.iterdir())) - except OSError as ex: + except OSError: # do not recurse into children. - # the error should have been already stored in the - # `exception` attribute of the current parent node. + # the error should have been already stored as + # `CapturedException` in the `exception` attribute of the + # current parent node on creation. return # apply exclusion filters @@ -957,7 +967,8 @@ def __init__(self, *args, **kwargs): any(self.path.iterdir()) except OSError as ex: # permission errors etc. are logged and stored as node - # attribute so they can be passed to results dict + # attribute so they can be passed to results dict. + # this will overwrite any exception passed to the constructor. 
self.exception = CapturedException(ex, level=10) # DEBUG level From 6a047fac31e4667a7648d16b00d9b2717c29fe68 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 10:55:15 +0200 Subject: [PATCH 080/131] detect dataset without ds.id if it has metadata aggregator --- datalad_next/tree.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 2f01f254..9f666476 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -371,7 +371,9 @@ def is_dataset(path: Path): - installed, or - not installed, but has an installed superdatset. - Only consider datalad datasets, not plain git/git-annex repos. + Only consider datalad datasets, not plain git/git-annex repos. Datasets + used for aggregating metadatata from subdatasets are also counted as + datasets, although they do not have a dataset ID themselves. Symlinks pointing to datasets are not resolved, so will always return False for symlinks. This prevents potentially detecting duplicate datasets @@ -392,7 +394,8 @@ def is_dataset(path: Path): if path.is_symlink(): return False # ignore symlinks even if pointing to datasets - if (path / ".datalad" / "config").is_file(): + if (path / ".datalad" / "config").is_file() or \ + (path / ".datalad" / "metadata").is_dir(): # could also query `ds.id`, but checking just for existence # of config file is quicker. return True From e101c85f358ca6cb16a5b54bd760d9770634480f Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 10:56:14 +0200 Subject: [PATCH 081/131] add debug logging --- datalad_next/tree.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 9f666476..7570388b 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -392,7 +392,11 @@ def is_dataset(path: Path): """ try: if path.is_symlink(): - return False # ignore symlinks even if pointing to datasets + # ignore symlinks even if pointing to datasets, otherwise we may + # get duplicate counts of datasets + lgr.debug("Path is a symlink, do not consider it a DatasetNode: " + f"'{path}'") + return False if (path / ".datalad" / "config").is_file() or \ (path / ".datalad" / "metadata").is_dir(): @@ -410,7 +414,8 @@ def is_dataset(path: Path): except Exception as ex: # if anything fails (e.g. permission denied), we raise exception - # instead of returning False + # instead of returning False. this can be caught and handled by the + # caller. raise NoDatasetFound(f"Cannot determine if '{path.name}' is a " f"dataset") from ex @@ -795,6 +800,9 @@ def exclude_func(path: Path): except Exception as ex: CapturedException(ex, level=10) # DEBUG level + lgr.debug(f"Excluding path '{path}' from tree because " + "an exception occurred while applying the " + "exclusion filter.") return True # exclude by default return False # do not exclude From 7e16f263e52d92e2b8c2c6d94018619066df9714 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 10:56:38 +0200 Subject: [PATCH 082/131] use None as dummy depth instead of -1 (allowed value) --- datalad_next/tree.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 7570388b..18e2f88d 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -752,9 +752,9 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): # current value of the ds_generator. 
the generator will be initialized # lazily, so for now we set the value to a dummy `_TreeNode` - # with an impossible depth just to distinguish it from None (None means - # the generator has finished). - self._next_ds = _TreeNode(self.root, -1) + # with an impossible depth just to distinguish it from None + # (None means the generator has finished). + self._next_ds = _TreeNode(self.root, None) @increment_node_count def generate_nodes(self): @@ -779,7 +779,7 @@ def exclude_func(path: Path): try: # initialize dataset(-parent) generator if not done yet if self._next_ds is not None and \ - self._next_ds.depth == -1: # dummy depth + self._next_ds.depth is None: # dummy depth self._advance_ds_generator() if path.is_dir() and is_dataset(path): @@ -820,6 +820,11 @@ def exclude_func(path: Path): def _advance_ds_generator(self): """Go to the next dataset or parent of dataset""" self._next_ds = next(self._ds_generator, None) + if self._next_ds is not None: + lgr.debug( + f"Next dataset" + + (" parent" if isinstance(self._next_ds, DirectoryNode) else "") + + f": {self._next_ds.path}") def _generate_datasets(self): """Generator of dataset nodes and their parent directories starting From 1c6b9a35c1ac21f8b952c1a407c1ed3bce630450 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 20:18:00 +0200 Subject: [PATCH 083/131] improve documentation of TestTree base class --- datalad_next/tests/test_tree.py | 35 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 05009415..40935734 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -6,22 +6,27 @@ from datalad.cli.tests.test_main import run_main from datalad.tests.test_utils_testrepos import BasicGitTestRepo from datalad.tests.utils_pytest import ( - assert_in, assert_raises, assert_str_equal, with_tree, ok_exists, with_tempfile, get_deeply_nested_structure, - skip_wo_symlink_capability + skip_wo_symlink_capability, + ok_good_symlink, + ok_broken_symlink ) from datalad.utils import ( rmtemp, make_tempfile, + chpwd ) from datalad.ui import ui -from ..tree import Tree +from ..tree import ( + Tree, + TreeCommand +) """Tests for the ``datalad tree`` command.""" @@ -266,7 +271,9 @@ def pytest_generate_tests(metafunc): See: https://docs.pytest.org/en/7.1.x/example/parametrize.html#parametrizing-test-methods-through-per-class-configuration """ - if metafunc.cls: + if metafunc.cls and \ + hasattr(metafunc.cls, 'params') and \ + hasattr(metafunc.cls, 'MATRIX'): test_id = metafunc.function.__name__ test_params_dict = metafunc.cls.params matrix = metafunc.cls.MATRIX @@ -320,14 +327,26 @@ def test_tree_with_circular_symlinks(path=None): # Compare with output of 'tree' command # import subprocess # subprocess.run(["tree", "-dlL", "2", root]) +class TestTree: + """Base class with tests that should run for multiple Tree + configurations. + Configurations are defined by: -class TestTree: - """Base class with tests that should run for all Tree configurations""" + - ``MATRIX``: dicts of pytest parameters and their values, where each dict + corresponds to a separate parametrized test instance. + - ``params``: a dict defining for each test method, which parameters + will be used in that test (from the parameter names contained in + ``MATRIX``). 
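The `pytest_generate_tests` hook above is what turns the `MATRIX`/`params` pair into parametrized test instances. A condensed sketch of that wiring (deduplication of repeated value combinations is omitted here):

    def pytest_generate_tests(metafunc):
        cls = metafunc.cls
        if cls and hasattr(cls, 'params') and hasattr(cls, 'MATRIX'):
            argnames = cls.params.get(metafunc.function.__name__)
            if argnames:
                argvalues = [
                    tuple(case[name] for name in argnames)
                    for case in cls.MATRIX
                ]
                metafunc.parametrize(','.join(argnames), argvalues)

Each dict in MATRIX may carry more keys than a given test consumes; the tuple comprehension picks out exactly the names listed in params for that test.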
+ """ __test__ = False # tells pytest to not collect tests in this class path = None # will be set by the inject_* fixture to temp dir tree root - # dict specifying multiple argument sets for a test method + # matrix of combinations of parameters to be tested and their + # expected results + MATRIX = [] + + # dict specifying parameter sets for each test method params = { "test_print_tree": [ "depth", "include_files", "include_hidden", "expected_str" @@ -344,8 +363,6 @@ class TestTreeWithoutDatasets(TestTree): __test__ = True - # matrix holds combinations of parameters to be tested - # and their expected results MATRIX = [ { "depth": 1, From 8f46400b4e90d15e74da2d32aec3fea58c3babb5 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 20:19:46 +0200 Subject: [PATCH 084/131] move tests for filesystem issues to separate test class, add test for broken symlinks --- datalad_next/tests/test_tree.py | 205 ++++++++++++++++++++++++-------- 1 file changed, 154 insertions(+), 51 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 40935734..53046b80 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -287,46 +287,6 @@ def pytest_generate_tests(metafunc): # ================================= Tests ===================================== -def test_print_tree_fails_for_nonexistent_directory(): - """Obtain nonexistent directory by creating a temp dir and deleting it - (may be safest method)""" - with make_tempfile(mkdir=True) as nonexistent_dir: - ok_exists(nonexistent_dir) # just wait for it to be deleted - with assert_raises(ValueError): - Tree(Path(nonexistent_dir), max_depth=1) - - -@skip_wo_symlink_capability -@with_tempfile -def test_tree_with_circular_symlinks(path=None): - """Test that we do not follow symlinks that point to directories - underneath the tree root (or its parent), to avoid duplicate subtrees""" - ds = get_deeply_nested_structure(path) - root = ds.path - command = ["tree", "--depth", "2", root] - _, actual_res, _ = get_tree_rendered_output(command) - expected_res = """ -├── directory_untracked/ -│ └── link2dir/ -├── link2dir/ -├── link2subdsdir/ -├── link2subdsroot/ -├── subdir/ -└── [DS~1] subds_modified/ - ├── link2superdsdir/ - ├── subdir/ - └── [DS~2] subds_lvl1_modified/ -""".lstrip("\n") - - ui.message("expected:") - ui.message(expected_res) - ui.message("actual:") - ui.message(actual_res) - assert_str_equal(expected_res, actual_res) - - # Compare with output of 'tree' command - # import subprocess - # subprocess.run(["tree", "-dlL", "2", root]) class TestTree: """Base class with tests that should run for multiple Tree configurations. 
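For reference, the broken-symlink fixtures exercised by the relocated tests below need nothing beyond pathlib. A self-contained sketch; the imported `ok_good_symlink`/`ok_broken_symlink` helpers are assumed to assert essentially these conditions:

    from pathlib import Path

    def make_broken_symlink(base: Path) -> Path:
        link = base / 'dangling'
        link.symlink_to(base / 'no-such-target')
        # a dangling link is still a symlink, but exists() follows it
        # and therefore reports False
        assert link.is_symlink() and not link.exists()
        return link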
@@ -528,17 +488,6 @@ def test_print_stats( expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) - def test_print_tree_permission_denied(self): - parent_dir = str(self.path) - with make_tempfile(mkdir=True, dir=parent_dir) as forbidden_dir: - # temporarily remove all permissions - with ensure_no_permissions(Path(forbidden_dir)): - # tree command should return error exit status but not crash - command = ['tree', parent_dir, '--depth', '2', '--include-files'] - _, actual, _ = get_tree_rendered_output(command, exit_code=1) - ui.message(actual) - assert_in("[error opening dir]", actual) - @pytest.mark.parametrize( "root_dir_name", ["root/", "root/.", "root/./", "root/../root"] ) @@ -702,3 +651,157 @@ def test_print_stats( _, _, actual_res = get_tree_rendered_output(command) expected_res = expected_stats_str assert_str_equal(expected_res, actual_res) + + +class TestTreeFilesystemIssues: + """Test tree with missing permissions, broken symlinks, etc.""" + + def test_print_tree_fails_for_nonexistent_directory(self): + """Obtain nonexistent directory by creating a temp dir and deleting it + (may be safest method)""" + with make_tempfile(mkdir=True) as nonexistent_dir: + ok_exists(nonexistent_dir) # just wait for it to be deleted + with assert_raises(ValueError): + Tree(Path(nonexistent_dir), max_depth=1) + + @with_tempfile + def test_print_tree_permission_denied(self, path=None): + """ + - If the tree contains a directory for which the user has no + permissions (so it would not be possible to traverse it), a message + should be displayed next to the affected directory path + - The rest of the tree following the forbidden directory should + be printed as usual + - The command should return error exit status but not crash + """ + (Path(path) / 'z_dir' / 'subdir').mkdir(parents=True) + forbidden_dir = Path(path) / 'a_forbidden_dir' + forbidden_dir.mkdir(parents=True) + # temporarily remove all permissions (octal 000) + # restore permissions at the end, otherwise we can't delete temp dir + with ensure_no_permissions(forbidden_dir): + command = ['tree', path, '--depth', '2'] + # expect exit code 1 + _, actual, _ = get_tree_rendered_output(command, exit_code=1) + expected = f""" +├── {forbidden_dir.name}/ [error opening dir] +└── z_dir/ + └── subdir/ +""".lstrip("\n") + ui.message("expected:") + ui.message(expected) + ui.message("actual:") + ui.message(actual) + assert_str_equal(expected, actual) + + @skip_wo_symlink_capability + @with_tempfile + def test_tree_with_broken_symlinks(self, path=None): + """Test that broken symlinks are reported as such""" + dpath = Path(path) + dir1 = dpath / 'real' / 'dir1' + file1 = dpath / 'real' / 'dir1' / 'file1' + dir1.mkdir(parents=True) + file1.touch() + + # create good symlinks + (dpath / 'links').mkdir() + # 1. symlink pointing to directory + link_to_dir1 = dpath / 'links' / 'link_to_dir1' + link_to_dir1.symlink_to(dir1, target_is_directory=True) + ok_good_symlink(link_to_dir1) + # 2. symlink pointing to file + link_to_file1 = dpath / 'links' / 'link_to_file1' + link_to_file1.symlink_to(file1) + ok_good_symlink(link_to_file1) + + # create bad symlinks + # 1. symlink pointing to non-existent target + link_to_nonexistent = dpath / 'links' / 'link_to_nonexistent' + link_to_nonexistent.symlink_to(dpath / 'nonexistent') + ok_broken_symlink(link_to_nonexistent) + + # 2. 
symlink pointing to itself + link_to_self = dpath / 'links' / 'link_to_self' + link_to_self.symlink_to(link_to_self) + with assert_raises(RuntimeError): + link_to_self.resolve() # fails because of infinite loop + + # test results dict using python API + actual = TreeCommand.__call__( + dpath, + include_files=True, + result_renderer="disabled", + result_xfm="paths", + result_filter=lambda res: res.get("is_broken_symlink", False) + ) + expected = [str(link_to_nonexistent), str(link_to_self)] + assert expected == actual + + @skip_wo_symlink_capability + @with_tempfile + def test_print_tree_with_recursive_symlinks(self, path=None): + """ + TODO: break down into separate tests + + - Symlinks targets are displayed in custom renderer output + - We do not follow symlinks that point to directories underneath + the tree root or its parent (to prevent duplicate subtrees) + - Symlinks pointing to datasets are not considered dataset nodes + themselves, but regular directories (to prevent duplicate counts + of datasets) + """ + parent = Path(path) + ds = get_deeply_nested_structure(str(parent / 'superds')) + + # change current dir to create symlinks with relative path + with chpwd(ds.path): + # create symlink to a sibling directory of the tree + # (should be recursed into) + (parent / 'ext_dir' / 'ext_subdir').mkdir(parents=True) + Path('link2extdir').symlink_to(Path('..') / 'ext_dir', + target_is_directory=True) + + # create symlink to grandparent of the tree root (should NOT + # be recursed into) + Path('link2parent').symlink_to(Path('..') / '..', + target_is_directory=True) + + # create symlink to subdir of the tree root at depth > max_depth + # (should be recursed into) + deepdir = Path('subds_modified') / 'subdir' / 'deepdir' + deepdir.mkdir() + (deepdir / 'subdeepdir').mkdir() + Path('link2deepdir').symlink_to(deepdir, target_is_directory=True) + + root = ds.path + command = ["tree", "--depth", "2", root] + _, actual_res, counts = get_tree_rendered_output(command) + expected_res = """ +├── directory_untracked/ +│ └── link2dir/ -> ../subdir +├── link2deepdir/ -> subds_modified/subdir/deepdir +│ └── subdeepdir/ +├── link2dir/ -> subdir +├── link2extdir/ -> ../ext_dir +│ └── ext_subdir/ +├── link2parent/ -> ../.. 
+├── link2subdsdir/ -> subds_modified/subdir +├── link2subdsroot/ -> subds_modified +├── subdir/ +└── [DS~1] subds_modified/ + ├── link2superdsdir/ -> ../subdir + ├── subdir/ + └── [DS~2] subds_lvl1_modified/ +""".lstrip("\n") + + # Compare with output of 'tree' command + # ui.message(counts) + # import subprocess + # subprocess.run(["tree", "-dlL", "2", root]) + + ui.message("expected:") + ui.message(expected_res) + ui.message("actual:") + ui.message(actual_res) + assert_str_equal(expected_res, actual_res) From ae1c6ad77531a84a7486be68aa1acbe254cfe6b0 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 20:20:42 +0200 Subject: [PATCH 085/131] add test for no difference if tree root is absolute or relative path --- datalad_next/tests/test_tree.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 53046b80..cf4fc670 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -502,6 +502,16 @@ def test_root_path_is_normalized(self, root_dir_name): expected = str(self.path / "root") assert_str_equal(expected, actual) + def test_no_difference_if_root_path_absolute_or_relative(self): + """Tree output should be identical whether the root directory + is given as absolute or relative path""" + root = str(self.path / "root") + output_abs_path = get_tree_rendered_output(['tree', root]) + with chpwd(root): + output_rel_path = get_tree_rendered_output(['tree', '.']) + + assert output_abs_path == output_rel_path + def test_print_tree_depth_zero(self): root = str(self.path / "root") # including files should # have no effect From ee162cd1fab554683a7d8d9a6f914a6dc59572e1 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 20:21:02 +0200 Subject: [PATCH 086/131] expand test for 0-depth tree --- datalad_next/tests/test_tree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index cf4fc670..efdf9b8c 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -514,11 +514,11 @@ def test_no_difference_if_root_path_absolute_or_relative(self): def test_print_tree_depth_zero(self): root = str(self.path / "root") - # including files should # have no effect + # including files should have no effect command = ['tree', root, '--depth', '0', '--include-files'] - actual, _, _ = get_tree_rendered_output(command) - expected = str(self.path / "root") - assert_str_equal(expected, actual) + actual = get_tree_rendered_output(command) + expected = (root, '', '0 datasets, 0 directories, 0 files') + assert expected == actual @pytest.mark.usefixtures("inject_path_ds") From 99b627dd74ed960691536988f1d3a63950e8eb34 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 20:59:21 +0200 Subject: [PATCH 087/131] fix logic for recursive symlink detection, add logging --- datalad_next/tree.py | 45 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 18e2f88d..777aba92 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -610,12 +610,13 @@ def path_depth(self, path: Path): f"'{self.root}' (or vice-versa)") def is_recursive_symlink(self, dir_path: Path): - """Detect symlink pointing to a directory within the same tree. + """Detect symlink pointing to a directory within the same tree + (directly or indirectly). 
The default behaviour is to follow symlinks. However, we do not follow symlinks to directories that we may visit or have visited already, i.e. are also located under the tree root or any parent of - the tree root. + the tree root (within a distance of ``max_depth``). Otherwise, the same subtree could be generated multiple times in different places, potentially in a recursive loop (e.g. if the @@ -627,28 +628,22 @@ def is_recursive_symlink(self, dir_path: Path): if not dir_path.is_symlink(): return False - try: - # do not check if target actually exists, because if it doesn't, - # it will not be detected as directory, so we won't try to - # recurse into it anyway - target_dir = dir_path.resolve(strict=False) - except RuntimeError: - # RuntimeError means symlink points to itself, so it's all the - # more recursive - return True - else: - if is_path_relative_to(target_dir, self.root): - # target dir is within `max_depth` levels under the current - # tree, so it will likely be yielded or has already been - # yielded (bar any exclusion filters) - return self.max_depth is None or \ - self.path_depth(target_dir) <= self.max_depth - - elif is_path_relative_to(self.root, target_dir): - # target dir is a parent of the tree root, so we may still - # get into a loop if we recurse more than `max_depth` levels - return self.max_depth is None or \ - - self.path_depth(target_dir) > self.max_depth + if not dir_path.is_dir(): + # we are only interested in symlinks pointing to a directory + raise ValueError("Path must be a directory") + + target_dir = dir_path.resolve() + + if is_path_relative_to(target_dir, self.root) or \ + is_path_relative_to(self.root, target_dir): + # either: + # - target dir is within `max_depth` levels beneath the tree + # root, so it will likely be yielded or has already been + # yielded (bar any exclusion filters) + # - target dir is a parent of the tree root, so we may still + # get into a loop if we recurse more than `max_depth` levels + return self.max_depth is None or \ + abs(self.path_depth(target_dir)) <= self.max_depth def _generate_tree_nodes(self, dir_path: Path): """Recursively yield ``_TreeNode`` objects starting from @@ -670,6 +665,8 @@ def _generate_tree_nodes(self, dir_path: Path): if self.is_recursive_symlink(dir_path): # if symlink points to directory that we may visit or may # have visited already, do not recurse into it + lgr.debug(f"Symlink is potentially recursive, " + f"will not traverse target directory: '{dir_path}'") return try: From de16e02068ce565cfbf697ca86d1e7ce95233eef Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 21:00:40 +0200 Subject: [PATCH 088/131] support symlinks in result dict and custom renderer --- datalad_next/tree.py | 81 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 8 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 777aba92..3519fe7a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -12,6 +12,7 @@ import logging from functools import wraps, lru_cache +from os import readlink from pathlib import Path from datalad.interface.base import ( @@ -25,7 +26,8 @@ from datalad.support.param import Parameter from datalad.distribution.dataset import ( datasetmethod, - require_dataset, Dataset, + require_dataset, + Dataset, ) from datalad.interface.results import ( get_status_dict, @@ -180,9 +182,19 @@ def __call__( "dataset_is_installed": node.is_installed }) + if node.is_symlink(): + # TODO: should we inform if the symlink is recursive (as per + # 
`tree.is_recursive_symlink()`) although not broken? The + # UNIX 'tree' command shows the message '[recursive, + # not followed]' next to the path. Not sure if this is + # interesting at all or more confusing. + res_dict["symlink_target"] = node.get_symlink_target() + res_dict["is_broken_symlink"] = node.is_broken_symlink() + if node.exception is not None: # mimic error message of unix 'tree' command for - # permission denied error + # permission denied error, otherwise use exception short + # message message = "error opening dir" \ if node.exception.name == "PermissionError" \ else node.exception.message @@ -252,18 +264,27 @@ def custom_result_renderer(res, **kwargs): color_for_type = { "dataset": ansi_colors.MAGENTA, "directory": ansi_colors.BLUE, - "file": None + "file": None, + "broken_symlink": ansi_colors.RED } # ANSI color for the path, if terminal colors are enabled color = color_for_type[node_type] if color is not None: path = ansi_colors.color_word(path, color) + if res.get("is_broken_symlink", False): + path = ansi_colors.color_word(path, + color_for_type["broken_symlink"]) # set suffix for directories dir_suffix = "" if depth > 0 and node_type in ("directory", "dataset"): dir_suffix = "/" + # append symlink target if symlink + symlink_target = "" + if "symlink_target" in res: + symlink_target = " -> " + res["symlink_target"] + # add short error message if there was exception error_msg = "" if "exception" in res: @@ -271,7 +292,7 @@ def custom_result_renderer(res, **kwargs): line = indentation + \ " ".join((s for s in (prefix, ds_marker, path) if s != "")) + \ - dir_suffix + error_msg + dir_suffix + symlink_target + error_msg ui.message(line) @staticmethod @@ -333,7 +354,11 @@ def _wrapper(*args, **kwargs): raise ValueError( f"No counts collected for unknown node type '{node_type}'" ) - if node.depth > 0: # we do not count the root directory + if node.depth > 0: # do not count the root directory + # TODO: do not count symlinks if they point to + # files/directories that are already included in the tree + # (to prevent double counting)? Note that UNIX 'tree' does + # count double. self.node_count[node_type] += 1 yield node # yield what the generator yielded @@ -383,7 +408,7 @@ def is_dataset(path: Path): be run multiple times on the same path. TODO: is there a way to detect a datalad dataset if it is not installed - and it is not a subdataset? + and it is not a subdataset? Parameters ---------- @@ -394,8 +419,8 @@ def is_dataset(path: Path): if path.is_symlink(): # ignore symlinks even if pointing to datasets, otherwise we may # get duplicate counts of datasets - lgr.debug("Path is a symlink, do not consider it a DatasetNode: " - f"'{path}'") + lgr.debug("Path is a symlink, will not check if it points to a " + f"dataset: '{path}'") return False if (path / ".datalad" / "config").is_file() or \ @@ -966,6 +991,46 @@ def parents(self): return parents_from_tree_root[::-1] # top-down order + def is_symlink(self) -> bool: + """Check if node path is a symlink""" + try: + if self.path.is_symlink(): + return True + except Exception as ex: + # could fail because of permission issues etc. 
+ # in which case we just default to False + self.exception = CapturedException(ex, level=10) + return False + + def is_broken_symlink(self) -> bool: + """If node path is a symlink, check if it points to a non-existing + target or to itself (self-referencing link)""" + try: + if self.is_symlink(): + self.path.resolve(strict=True) + return False + except FileNotFoundError: + return True + except RuntimeError: + # if symlink loop, consider it broken symlink + # (like UNIX 'tree' command does) + return True + except Exception as ex: + # probably broken in some way + self.exception = CapturedException(ex, level=10) + return True + + def get_symlink_target(self) -> str: + """If node path is a symlink, get link target as string. Does not + check that target path exists.""" + try: + if self.is_symlink(): + # use os.readlink() instead of Path.readlink() for + # Python <3.9 compatibility + return readlink(str(self.path)) + except Exception as ex: + self.exception = CapturedException(ex, level=10) + class DirectoryNode(_TreeNode): TYPE = "directory" From 6b2aec344bb52a7f56184484a36e9e3039d718fd Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 22:46:56 +0200 Subject: [PATCH 089/131] skip test for missing permissions if on windows --- datalad_next/tests/test_tree.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index efdf9b8c..b2f82eee 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -13,6 +13,7 @@ with_tempfile, get_deeply_nested_structure, skip_wo_symlink_capability, + skip_if_on_windows, ok_good_symlink, ok_broken_symlink ) @@ -674,6 +675,7 @@ def test_print_tree_fails_for_nonexistent_directory(self): with assert_raises(ValueError): Tree(Path(nonexistent_dir), max_depth=1) + @skip_if_on_windows @with_tempfile def test_print_tree_permission_denied(self, path=None): """ From 5bfaf004ee26b4c33bc75243a9da31ace63d9c4c Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 22:47:57 +0200 Subject: [PATCH 090/131] on windows, symlink loop raises OSError --- datalad_next/tests/test_tree.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index b2f82eee..44ee4ae5 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -736,8 +736,9 @@ def test_tree_with_broken_symlinks(self, path=None): # 2. 
symlink pointing to itself link_to_self = dpath / 'links' / 'link_to_self' link_to_self.symlink_to(link_to_self) - with assert_raises(RuntimeError): - link_to_self.resolve() # fails because of infinite loop + with assert_raises((RuntimeError, OSError)): # OSError on Windows + # resolution fails because of infinite loop + link_to_self.resolve() # test results dict using python API actual = TreeCommand.__call__( From e5a0e50b760e56e55873529fa5ac70a5104b2624 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 22:48:37 +0200 Subject: [PATCH 091/131] use platform-specific path separator for display of relative symlinks --- datalad_next/tests/test_tree.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 44ee4ae5..4400ce2c 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -1,5 +1,6 @@ from contextlib import contextmanager from pathlib import Path +from os import sep import pytest from datalad.distribution.dataset import Dataset @@ -790,20 +791,21 @@ def test_print_tree_with_recursive_symlinks(self, path=None): root = ds.path command = ["tree", "--depth", "2", root] _, actual_res, counts = get_tree_rendered_output(command) - expected_res = """ + s = sep + expected_res = f""" ├── directory_untracked/ -│ └── link2dir/ -> ../subdir -├── link2deepdir/ -> subds_modified/subdir/deepdir +│ └── link2dir/ -> ..{s}subdir +├── link2deepdir/ -> subds_modified{s}subdir{s}deepdir │ └── subdeepdir/ ├── link2dir/ -> subdir -├── link2extdir/ -> ../ext_dir +├── link2extdir/ -> ..{s}ext_dir │ └── ext_subdir/ -├── link2parent/ -> ../.. -├── link2subdsdir/ -> subds_modified/subdir +├── link2parent/ -> ..{s}.. +├── link2subdsdir/ -> subds_modified{s}subdir ├── link2subdsroot/ -> subds_modified ├── subdir/ └── [DS~1] subds_modified/ - ├── link2superdsdir/ -> ../subdir + ├── link2superdsdir/ -> ..{s}subdir ├── subdir/ └── [DS~2] subds_lvl1_modified/ """.lstrip("\n") From 26e2f178704634a1ae1c2f58d02712940879e76e Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 7 Aug 2022 23:27:50 +0200 Subject: [PATCH 092/131] catch OSError on windows for self-referencing symlink --- datalad_next/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 3519fe7a..159a605d 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -1011,7 +1011,7 @@ def is_broken_symlink(self) -> bool: return False except FileNotFoundError: return True - except RuntimeError: + except (RuntimeError, OSError): # OSError on Windows # if symlink loop, consider it broken symlink # (like UNIX 'tree' command does) return True From f3d7a17902f0fc131430cbfb8b854d6b4816c04c Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 12 Aug 2022 22:11:43 +0200 Subject: [PATCH 093/131] add (failing) test for broken symlinks pointing to inaccessible files/directories --- datalad_next/tests/test_tree.py | 76 ++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 4400ce2c..42803294 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -708,48 +708,92 @@ def test_print_tree_permission_denied(self, path=None): assert_str_equal(expected, actual) @skip_wo_symlink_capability - @with_tempfile - def test_tree_with_broken_symlinks(self, path=None): - """Test that broken symlinks are reported as 
such""" + @with_tempfile # main temp dir to construct tree + @with_tempfile # separate temp dir to be symlinked (forbidden_dir) + def test_tree_with_broken_symlinks(self, path=None, forbidden_path=None): + """Test that broken symlinks are reported as such. + TODO split into separate tests""" + # prep dpath = Path(path) dir1 = dpath / 'real' / 'dir1' file1 = dpath / 'real' / 'dir1' / 'file1' + forbidden_file = dpath / 'real' / 'dir1' / 'forbidden_file' dir1.mkdir(parents=True) file1.touch() + forbidden_file.touch() # permissions will be removed later ad-hoc + + # create directory without permissions (outside of main tree) + forbidden_dir = Path(forbidden_path) + forbidden_dir.mkdir() + file_in_forbidden_dir = forbidden_dir / 'file_in_forbidden_dir' + file_in_forbidden_dir.touch() # create good symlinks (dpath / 'links').mkdir() - # 1. symlink pointing to directory + + # 1. symlink pointing to existing directory link_to_dir1 = dpath / 'links' / 'link_to_dir1' link_to_dir1.symlink_to(dir1, target_is_directory=True) ok_good_symlink(link_to_dir1) - # 2. symlink pointing to file + + # 2. symlink pointing to existing file link_to_file1 = dpath / 'links' / 'link_to_file1' link_to_file1.symlink_to(file1) ok_good_symlink(link_to_file1) + # 3. symlink pointing to existing but inaccessible directory + link_to_forbidden_dir = dpath / 'links' / 'link_to_forbidden_dir' + link_to_forbidden_dir.symlink_to(forbidden_dir, target_is_directory=True) + with ensure_no_permissions(forbidden_dir): + ok_good_symlink(link_to_forbidden_dir) + + # 4. symlink pointing to existing but inaccessible file + link_to_forbidden_file = dpath / 'links' / 'link_to_forbidden_file' + link_to_forbidden_file.symlink_to(forbidden_file) + with ensure_no_permissions(forbidden_file): + ok_good_symlink(link_to_forbidden_file) + # create bad symlinks # 1. symlink pointing to non-existent target - link_to_nonexistent = dpath / 'links' / 'link_to_nonexistent' + link_to_nonexistent = dpath / 'links' / '1_link_to_nonexistent' link_to_nonexistent.symlink_to(dpath / 'nonexistent') ok_broken_symlink(link_to_nonexistent) # 2. symlink pointing to itself - link_to_self = dpath / 'links' / 'link_to_self' + link_to_self = dpath / 'links' / '2_link_to_self' link_to_self.symlink_to(link_to_self) with assert_raises((RuntimeError, OSError)): # OSError on Windows # resolution fails because of infinite loop link_to_self.resolve() - # test results dict using python API - actual = TreeCommand.__call__( - dpath, - include_files=True, - result_renderer="disabled", - result_xfm="paths", - result_filter=lambda res: res.get("is_broken_symlink", False) - ) - expected = [str(link_to_nonexistent), str(link_to_self)] + # 3. 
symlink pointing to file under inaccessible directory + link_to_file_in_forbidden_dir = dpath / 'links' / '3_link_to_file_in_forbidden_dir' + link_to_file_in_forbidden_dir.symlink_to(file_in_forbidden_dir) + with ensure_no_permissions(forbidden_dir): + with assert_raises(PermissionError): + # resolution fails because of missing permissions + link_to_file_in_forbidden_dir.resolve(strict=True) + + # temporarily remove all permissions (octal 000) + # restore permissions at the end, otherwise we can't delete temp dir + with ensure_no_permissions(forbidden_dir), \ + ensure_no_permissions(forbidden_file): + + # test results dict using python API + # implicitly also tests that command yields tree without crashing + actual = TreeCommand.__call__( + dpath, + include_files=True, + result_renderer="disabled", + result_xfm="paths", + result_filter=lambda res: res.get("is_broken_symlink", False), + on_failure="ignore" + ) + expected = [ + str(link_to_nonexistent), + str(link_to_self), + str(link_to_file_in_forbidden_dir) + ] assert expected == actual @skip_wo_symlink_capability From 322e763ac8ffeeba004eb329eb2e0095e1d8a90c Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 12 Aug 2022 22:13:42 +0200 Subject: [PATCH 094/131] handle permission error when symlink points to file under inaccessible directory --- datalad_next/tree.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 159a605d..edf4b4e7 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -725,11 +725,22 @@ def _generate_tree_nodes(self, dir_path: Path): else: self.exhausted_levels.discard(self.path_depth(child)) - if child.is_dir(): - # recurse into subdirectories - yield from self._generate_tree_nodes(child) - else: - yield FileNode(child, self.path_depth(child)) + try: + # `child.is_dir()` could fail because of permission error + # error if node is symlink pointing to contents under + # inaccessible directory + if child.is_dir(): + # recurse into subdirectories + yield from self._generate_tree_nodes(child) + else: + yield FileNode(child, self.path_depth(child)) + except OSError as ex: + # assume it's a file + yield FileNode( + child, + self.path_depth(child), + exception=CapturedException(ex, level=10) + ) @increment_node_count def generate_nodes(self): @@ -1042,6 +1053,7 @@ def __init__(self, *args, **kwargs): # get first child if exists. this is a check for whether # we can (potentially) recurse into the directory or # if there are any filesystem issues (permissions errors, etc) + # TODO: replace with generic `Path.stat()`? (follows symlinks) any(self.path.iterdir()) except OSError as ex: # permission errors etc. 
are logged and stored as node From 69fa59118ea4e3c108e0dc5f324b9f53144a821b Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Fri, 12 Aug 2022 22:14:44 +0200 Subject: [PATCH 095/131] validate input of is_broken_symlink(), catch PermissionError as separate case --- datalad_next/tree.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index edf4b4e7..9a969c5f 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -1014,26 +1014,29 @@ def is_symlink(self) -> bool: return False def is_broken_symlink(self) -> bool: - """If node path is a symlink, check if it points to a non-existing - target or to itself (self-referencing link)""" + """If node path is a symlink, check if it points to a nonexisting + or inaccessible target or to itself (self-referencing link). Raise + exception if the node path is not a symlink.""" + if not self.is_symlink(): + raise ValueError("Node path is not a symlink, cannot check if " + f"symlink is broken: {self.path}") + try: - if self.is_symlink(): - self.path.resolve(strict=True) - return False - except FileNotFoundError: + self.path.resolve(strict=True) + return False + except FileNotFoundError: # target does not exist return True - except (RuntimeError, OSError): # OSError on Windows - # if symlink loop, consider it broken symlink - # (like UNIX 'tree' command does) + except PermissionError: # target exists but is not accessible return True - except Exception as ex: - # probably broken in some way + except (RuntimeError, OSError): # symlink loop (OSError on Windows) + return True + except Exception as ex: # probably broken in some other way self.exception = CapturedException(ex, level=10) return True def get_symlink_target(self) -> str: - """If node path is a symlink, get link target as string. Does not - check that target path exists.""" + """If node path is a symlink, get link target as string. Otherwise, + return None. 
Does not check that target path exists.""" try: if self.is_symlink(): # use os.readlink() instead of Path.readlink() for From 6db0e12fdef117c53f4dd7c84fa455e7e740596a Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 13 Aug 2022 02:11:29 +0200 Subject: [PATCH 096/131] discard exhausted_levels deeper than the current node's depth (not needed) --- datalad_next/tests/test_tree.py | 27 +++++++++++++++++++++++++++ datalad_next/tree.py | 15 +++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 42803294..8caf5f4e 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -315,6 +315,9 @@ class TestTree: ], "test_print_stats": [ "depth", "include_files", "include_hidden", "expected_stats_str" + ], + "test_exhausted_levels_are_below_current_depth": [ + "depth", "include_files", "include_hidden" ] } @@ -522,6 +525,30 @@ def test_print_tree_depth_zero(self): expected = (root, '', '0 datasets, 0 directories, 0 files') assert expected == actual + def test_exhausted_levels_are_below_current_depth( + self, depth, include_files, include_hidden): + """For each node, the exhausted levels reported for that node + should be smaller or equal to the node's depth""" + + results = TreeCommand.__call__( + self.path, + depth=depth, + include_files=include_files, + include_hidden=include_hidden, + result_renderer="disabled", + # return only 'depth' and 'exhausted_levels' from result dicts + result_xfm=lambda res: {k: res[k] + for k in ("depth", "exhausted_levels")} + ) + # sanity checks + assert len(results) > 1 + assert any(res["exhausted_levels"] for res in results) + + # actual test + assert all(level <= res["depth"] + for res in results + for level in res["exhausted_levels"]) + @pytest.mark.usefixtures("inject_path_ds") class TestTreeWithDatasets(TestTreeWithoutDatasets): diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 9a969c5f..d0839f11 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -589,10 +589,10 @@ def __init__(self, # will not be yielded by the node generator self.exclude_node_func = exclude_node_func or self.default_exclude_func - # keep track of levels where the subtree is exhausted, - # i.e. we have reached the last child of the current subtree. - # this is needed for the custom results renderer, to display - # nodes differently based on their relative position in the tree. + # keep track of levels where the subtree is exhausted, i.e. we + # have reached the last node of the current subtree. + # this is needed for the custom results renderer, to display nodes + # differently depending on whether they are the last child or not. 
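+        # e.g. once the last node of a subtree at depth 2 has been
+        # yielded, 2 is added to the set, telling the renderer to stop
+        # drawing '│' continuation lines at that indentation level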
self.exhausted_levels = set([]) # store dict with count of nodes for each node type, similar to the @@ -725,6 +725,13 @@ def _generate_tree_nodes(self, dir_path: Path): else: self.exhausted_levels.discard(self.path_depth(child)) + # remove exhausted levels that are deeper than the + # current depth (we don't need them anymore) + levels = set(self.exhausted_levels) # copy + self.exhausted_levels.difference_update( + l for l in levels if l > self.path_depth(child) + ) + try: # `child.is_dir()` could fail because of permission error # error if node is symlink pointing to contents under From 9c7990b59a2d2bb7c9bb4dac8941ce11fe84b4db Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 13 Aug 2022 13:53:26 +0200 Subject: [PATCH 097/131] replace 'with_tempfile' decorator with fixture to allow passing multiple fixtures to test methods --- datalad_next/tests/test_tree.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 8caf5f4e..907ea01d 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -84,6 +84,18 @@ def create_temp_dir_tree(tree_dict: dict) -> Path: return Path(temp_dir_root).resolve() +@pytest.fixture(scope="function") +def path(): + """Generic fixture for creating a temporary directory tree. + + TODO: harness pytest's native ``tmp_path`` / ``tmp_path_factory`` + fixtures""" + temp_dir_root = create_temp_dir_tree({}) # empty directory + yield temp_dir_root + rmtemp(temp_dir_root) + assert not temp_dir_root.exists() + + @pytest.fixture(scope="module") def path_no_ds(): """Fixture for creating a temporary directory tree (**without** datasets) @@ -704,8 +716,7 @@ def test_print_tree_fails_for_nonexistent_directory(self): Tree(Path(nonexistent_dir), max_depth=1) @skip_if_on_windows - @with_tempfile - def test_print_tree_permission_denied(self, path=None): + def test_print_tree_permission_denied(self, path): """ - If the tree contains a directory for which the user has no permissions (so it would not be possible to traverse it), a message @@ -720,7 +731,7 @@ def test_print_tree_permission_denied(self, path=None): # temporarily remove all permissions (octal 000) # restore permissions at the end, otherwise we can't delete temp dir with ensure_no_permissions(forbidden_dir): - command = ['tree', path, '--depth', '2'] + command = ['tree', str(path), '--depth', '2'] # expect exit code 1 _, actual, _ = get_tree_rendered_output(command, exit_code=1) expected = f""" @@ -824,8 +835,7 @@ def test_tree_with_broken_symlinks(self, path=None, forbidden_path=None): assert expected == actual @skip_wo_symlink_capability - @with_tempfile - def test_print_tree_with_recursive_symlinks(self, path=None): + def test_print_tree_with_recursive_symlinks(self, path): """ TODO: break down into separate tests @@ -836,14 +846,13 @@ def test_print_tree_with_recursive_symlinks(self, path=None): themselves, but regular directories (to prevent duplicate counts of datasets) """ - parent = Path(path) - ds = get_deeply_nested_structure(str(parent / 'superds')) + ds = get_deeply_nested_structure(str(path / 'superds')) # change current dir to create symlinks with relative path with chpwd(ds.path): # create symlink to a sibling directory of the tree # (should be recursed into) - (parent / 'ext_dir' / 'ext_subdir').mkdir(parents=True) + (path / 'ext_dir' / 'ext_subdir').mkdir(parents=True) Path('link2extdir').symlink_to(Path('..') / 'ext_dir', target_is_directory=True) From 
858ab68f9d6f218a9f52642904ca03728e9d9e6c Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 13 Aug 2022 13:55:26 +0200 Subject: [PATCH 098/131] split test for broken symlinks with vs without permission errors (to allow skipping permission tests on windows) --- datalad_next/tests/test_tree.py | 171 +++++++++++++++++++++----------- 1 file changed, 113 insertions(+), 58 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 907ea01d..617a72a3 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -11,7 +11,6 @@ assert_str_equal, with_tree, ok_exists, - with_tempfile, get_deeply_nested_structure, skip_wo_symlink_capability, skip_if_on_windows, @@ -746,72 +745,117 @@ def test_print_tree_permission_denied(self, path): assert_str_equal(expected, actual) @skip_wo_symlink_capability - @with_tempfile # main temp dir to construct tree - @with_tempfile # separate temp dir to be symlinked (forbidden_dir) - def test_tree_with_broken_symlinks(self, path=None, forbidden_path=None): - """Test that broken symlinks are reported as such. - TODO split into separate tests""" + @pytest.mark.parametrize("include_files", (True, False)) + def test_tree_with_broken_symlinks(self, path, include_files): + """Test that broken symlinks are reported as such""" # prep - dpath = Path(path) - dir1 = dpath / 'real' / 'dir1' - file1 = dpath / 'real' / 'dir1' / 'file1' - forbidden_file = dpath / 'real' / 'dir1' / 'forbidden_file' + dir1 = path / 'real' / 'dir1' + file1 = path / 'real' / 'dir1' / 'file1' dir1.mkdir(parents=True) file1.touch() - forbidden_file.touch() # permissions will be removed later ad-hoc - - # create directory without permissions (outside of main tree) - forbidden_dir = Path(forbidden_path) - forbidden_dir.mkdir() - file_in_forbidden_dir = forbidden_dir / 'file_in_forbidden_dir' - file_in_forbidden_dir.touch() + (path / 'links').mkdir() - # create good symlinks - (dpath / 'links').mkdir() + # create symlinks + # 1. broken symlink pointing to non-existent target + link_to_nonexistent = path / 'links' / '1_link_to_nonexistent' + link_to_nonexistent.symlink_to(path / 'nonexistent') + ok_broken_symlink(link_to_nonexistent) + # 2. broken symlink pointing to itself + link_to_self = path / 'links' / '2_link_to_self' + link_to_self.symlink_to(link_to_self) + with assert_raises((RuntimeError, OSError)): # OSError on Windows + # resolution should fail because of infinite loop + link_to_self.resolve() - # 1. symlink pointing to existing directory - link_to_dir1 = dpath / 'links' / 'link_to_dir1' + # 3. good symlink pointing to existing directory + link_to_dir1 = path / 'links' / '3_link_to_dir1' link_to_dir1.symlink_to(dir1, target_is_directory=True) ok_good_symlink(link_to_dir1) - - # 2. symlink pointing to existing file - link_to_file1 = dpath / 'links' / 'link_to_file1' + # 4. good symlink pointing to existing file + link_to_file1 = path / 'links' / '4_link_to_file1' link_to_file1.symlink_to(file1) ok_good_symlink(link_to_file1) - # 3. 
symlink pointing to existing but inaccessible directory
-        link_to_forbidden_dir = dpath / 'links' / 'link_to_forbidden_dir'
-        link_to_forbidden_dir.symlink_to(forbidden_dir, target_is_directory=True)
-        with ensure_no_permissions(forbidden_dir):
-            ok_good_symlink(link_to_forbidden_dir)
+        # test results dict using python API
+        # implicitly also tests that command yields tree without crashing
+        actual = TreeCommand.__call__(
+            path,
+            depth=None,  # unlimited
+            include_files=include_files,
+            result_renderer="disabled",
+            result_xfm=lambda res: (Path(res["path"]).name,
+                                    res["is_broken_symlink"]),
+            result_filter=lambda res: "is_broken_symlink" in res,
+            return_type="list",
+            on_failure="ignore"
+        )

-        # 4. symlink pointing to existing but inaccessible file
-        link_to_forbidden_file = dpath / 'links' / 'link_to_forbidden_file'
-        link_to_forbidden_file.symlink_to(forbidden_file)
-        with ensure_no_permissions(forbidden_file):
-            ok_good_symlink(link_to_forbidden_file)
+        if include_files:
+            expected = [
+                # (path, is_broken_symlink)
+                (link_to_nonexistent.name, True),
+                (link_to_self.name, True),
+                (link_to_dir1.name, False),
+                (link_to_file1.name, False)
+            ]
+        else:
+            expected = [
+                (link_to_dir1.name, False)
+            ]
+        assert set(expected) == set(actual)

-        # create bad symlinks
-        # 1. symlink pointing to non-existent target
-        link_to_nonexistent = dpath / 'links' / '1_link_to_nonexistent'
-        link_to_nonexistent.symlink_to(dpath / 'nonexistent')
-        ok_broken_symlink(link_to_nonexistent)
+    @skip_if_on_windows
+    @pytest.mark.parametrize("include_files", (True, False))
+    def test_tree_with_broken_symlinks_to_inaccessible_targets(
+            self, path, include_files):
+        """Test that symlinks to targets underneath inaccessible directories
+        are reported as broken, whereas symlinks to inaccessible
+        files/directories themselves are not reported as broken."""
+        # prep
+        root = path / "root"  # tree root
+        root.mkdir(parents=True)

-        # 2. symlink pointing to itself
-        link_to_self = dpath / 'links' / '2_link_to_self'
-        link_to_self.symlink_to(link_to_self)
-        with assert_raises((RuntimeError, OSError)):  # OSError on Windows
-            # resolution fails because of infinite loop
-            link_to_self.resolve()
+        # create file and directory without permissions outside of tree
+        # root (permissions will be removed later ad-hoc, because we need
+        # to create symlinks first)
+        forbidden_file = path / "forbidden_file"
+        forbidden_file.touch()  # permissions will be removed later ad-hoc
+        forbidden_dir = path / "forbidden_dir"
+        forbidden_dir.mkdir()
+        file_in_forbidden_dir = forbidden_dir / "file_in_forbidden_dir"
+        file_in_forbidden_dir.touch()
+        dir_in_forbidden_dir = forbidden_dir / "dir_in_forbidden_dir"
+        dir_in_forbidden_dir.mkdir()

-        # 3. symlink pointing to file under inaccessible directory
-        link_to_file_in_forbidden_dir = dpath / 'links' / '3_link_to_file_in_forbidden_dir'
+        # create symlinks
+        # 1. broken symlink pointing to file under inaccessible directory
+        link_to_file_in_forbidden_dir = root / "1_link_to_file_in_forbidden_dir"
         link_to_file_in_forbidden_dir.symlink_to(file_in_forbidden_dir)
         with ensure_no_permissions(forbidden_dir):
             with assert_raises(PermissionError):
-                # resolution fails because of missing permissions
+                # resolution should fail because of missing permissions
                 link_to_file_in_forbidden_dir.resolve(strict=True)

+        # 2. 
broken symlink pointing to directory under inaccessible directory + link_to_dir_in_forbidden_dir = root / "2_link_to_dir_in_forbidden_dir" + link_to_dir_in_forbidden_dir.symlink_to(dir_in_forbidden_dir) + with ensure_no_permissions(forbidden_dir): + with assert_raises(PermissionError): + # resolution should fail because of missing permissions + link_to_dir_in_forbidden_dir.resolve(strict=True) + + # 3. good symlink pointing to existing but inaccessible directory + link_to_forbidden_dir = root / "3_link_to_forbidden_dir" + link_to_forbidden_dir.symlink_to(forbidden_dir, target_is_directory=True) + with ensure_no_permissions(forbidden_dir): + ok_good_symlink(link_to_forbidden_dir) + + # 4. good symlink pointing to existing but inaccessible file + link_to_forbidden_file = root / "4_link_to_forbidden_file" + link_to_forbidden_file.symlink_to(forbidden_file) + with ensure_no_permissions(forbidden_file): + ok_good_symlink(link_to_forbidden_file) + # temporarily remove all permissions (octal 000) # restore permissions at the end, otherwise we can't delete temp dir with ensure_no_permissions(forbidden_dir), \ @@ -820,19 +864,30 @@ def test_tree_with_broken_symlinks(self, path=None, forbidden_path=None): # test results dict using python API # implicitly also tests that command yields tree without crashing actual = TreeCommand.__call__( - dpath, - include_files=True, + root, + depth=None, + include_files=include_files, result_renderer="disabled", - result_xfm="paths", - result_filter=lambda res: res.get("is_broken_symlink", False), + result_xfm=lambda res: (Path(res["path"]).name, + res["is_broken_symlink"]), + result_filter=lambda res: "is_broken_symlink" in res, + return_type="list", on_failure="ignore" ) - expected = [ - str(link_to_nonexistent), - str(link_to_self), - str(link_to_file_in_forbidden_dir) - ] - assert expected == actual + + if include_files: + expected = [ + # (path, is_broken_symlink) + (link_to_file_in_forbidden_dir.name, True), + (link_to_dir_in_forbidden_dir.name, True), + (link_to_forbidden_dir.name, False), + (link_to_forbidden_file.name, False) + ] + else: + expected = [ + (link_to_forbidden_dir.name, False) + ] + assert set(expected) == set(actual) @skip_wo_symlink_capability def test_print_tree_with_recursive_symlinks(self, path): From ad482ce174e8c6bff40920bd944d26c0a48dde3e Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 13 Aug 2022 20:25:42 +0200 Subject: [PATCH 099/131] make uniform usage of pathlib's resolve() --- datalad_next/tree.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index d0839f11..f3638f15 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -504,7 +504,7 @@ def get_dataset_root_datalad_only(path: Path): return potential_ds_root # it's a match # we go one directory higher and try again - ds_root = Path.resolve(potential_ds_root / '..') + ds_root = (potential_ds_root / "..").resolve(strict=True) return ds_root @@ -522,13 +522,11 @@ def get_superdataset(path: Path): ------- Dataset or None """ - path = str(path) superds_path = None while path: - # normalize the path after adding .. 
so we guaranteed to not - # follow into original directory if path itself is a symlink - parent_path = Path.resolve(Path(path) / '..') + parent_path = (path / "..").resolve(strict=True) + sds_path_ = get_dataset_root_datalad_only(parent_path) if sds_path_ is None: # no more parents, use previous found @@ -537,12 +535,12 @@ def get_superdataset(path: Path): superds = Dataset(sds_path_) # test if path is registered subdataset of the parent - if not is_subds_of_parent(Path(path), superds.pathobj): + if not is_subds_of_parent(path, superds.pathobj): break # That was a good candidate superds_path = sds_path_ - path = str(parent_path) + path = parent_path break if superds_path is None: @@ -733,7 +731,7 @@ def _generate_tree_nodes(self, dir_path: Path): ) try: - # `child.is_dir()` could fail because of permission error + # `child.is_dir()` could fail because of permission # error if node is symlink pointing to contents under # inaccessible directory if child.is_dir(): @@ -996,6 +994,7 @@ def tree_root(self) -> Path: @property def parents(self): """List of parent paths in top-down order beginning from the tree root. + Assumes the node path to be already normalized. Returns ------- @@ -1114,7 +1113,7 @@ def calculate_dataset_depth(self): ds = self.ds while ds: - superds = get_superdataset(ds.path) + superds = get_superdataset(ds.pathobj) if superds is None: # it is not a dataset, do nothing From 5ba7bdbf3c2aba792d4176e88c4ee7e26a432750 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 13 Aug 2022 20:32:56 +0200 Subject: [PATCH 100/131] handle permission error for input root path in Tree constructor --- datalad_next/tree.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index f3638f15..e48d8f95 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -575,9 +575,11 @@ def __init__(self, exclude_node_func: Callable or None Function to filter out tree nodes from the tree """ - self.root = root.resolve() - if not self.root.is_dir(): - raise ValueError(f"Directory not found: '{root}'") + self.root = root.resolve(strict=False) + try: + assert self.root.is_dir(), f"path is not a directory: {self.root}" + except (AssertionError, OSError) as ex: # could be permission error + raise ValueError(f"directory not found: '{root}'") from ex self.max_depth = max_depth if max_depth is not None and max_depth < 0: From 55bf11d2270704470371ffebe82a0ea389f1801a Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 13 Aug 2022 22:30:19 +0200 Subject: [PATCH 101/131] create nodes using single factory class to centralize error handling --- datalad_next/tree.py | 122 +++++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index e48d8f95..fd56d14f 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -615,10 +615,6 @@ def path_depth(self, path: Path): Can also be a negative integer if the path is a parent of the tree root. 
- Parameters - ---------- - path: Path - Returns ------- int @@ -671,21 +667,22 @@ def is_recursive_symlink(self, dir_path: Path): abs(self.path_depth(target_dir)) <= self.max_depth def _generate_tree_nodes(self, dir_path: Path): - """Recursively yield ``_TreeNode`` objects starting from - ``dir_path`` + """Recursively yield ``_TreeNode`` objects starting from ``dir_path`` Parameters ---------- dir_path: Path Directory from which to calculate the tree """ - # yield current node - yield DirectoryOrDatasetNode(dir_path, self.path_depth(dir_path)) + # yield current directory/dataset node + current_depth = self.path_depth(dir_path) + current_node = Node(dir_path, current_depth) + yield current_node # check that we are within max_depth levels # (None means unlimited depth) if self.max_depth is None or \ - self.path_depth(dir_path) < self.max_depth: + current_depth < self.max_depth: if self.is_recursive_symlink(dir_path): # if symlink points to directory that we may visit or may @@ -694,21 +691,23 @@ def _generate_tree_nodes(self, dir_path: Path): f"will not traverse target directory: '{dir_path}'") return - try: - # sort child nodes alphabetically - # needs to be done *before* calling the exclusion function, - # because the function may depend on sort order - all_children = sorted(list(dir_path.iterdir())) - except OSError: - # do not recurse into children. - # the error should have been already stored as - # `CapturedException` in the `exception` attribute of the - # current parent node on creation. + if current_node.exception is not None: + # if some exception occurred when instantiating the node + # (missing permissions etc), do not recurse into directory + lgr.debug("Node has exception, will not traverse directory: " + f"path={current_node.path}, exc={current_node.exception}") return + # sort child nodes alphabetically + # needs to be done *before* calling the exclusion function, + # because the function may depend on sort order + all_children = sorted(list(dir_path.iterdir())) + child_depth = current_depth + 1 + # apply exclusion filters children = ( - p for p in all_children + Node(p, child_depth) + for p in all_children if not self.exclude_node_func(p) ) @@ -721,33 +720,23 @@ def _generate_tree_nodes(self, dir_path: Path): for is_last_child, child in yield_with_last_item(children): if is_last_child: # last child of its subtree - self.exhausted_levels.add(self.path_depth(child)) + self.exhausted_levels.add(child_depth) else: - self.exhausted_levels.discard(self.path_depth(child)) + self.exhausted_levels.discard(child_depth) # remove exhausted levels that are deeper than the # current depth (we don't need them anymore) levels = set(self.exhausted_levels) # copy self.exhausted_levels.difference_update( - l for l in levels if l > self.path_depth(child) + l for l in levels if l > child_depth ) - try: - # `child.is_dir()` could fail because of permission - # error if node is symlink pointing to contents under - # inaccessible directory - if child.is_dir(): - # recurse into subdirectories - yield from self._generate_tree_nodes(child) - else: - yield FileNode(child, self.path_depth(child)) - except OSError as ex: - # assume it's a file - yield FileNode( - child, - self.path_depth(child), - exception=CapturedException(ex, level=10) - ) + if isinstance(child, (DirectoryNode, DatasetNode)): + # recurse into subdirectories + yield from self._generate_tree_nodes(child.path) + else: + # it's a file, just yield it + yield child @increment_node_count def generate_nodes(self): @@ -913,7 +902,7 @@ def 
exclude(p: Path): if parent not in visited: visited.add(parent) - yield DirectoryOrDatasetNode(parent, depth) + yield Node(parent, depth) visited.add(node.path) yield node @@ -977,7 +966,6 @@ def __init__(self, path: Path, depth: int, """ self.path = path self.depth = depth - # TODO: should be error collection / list of exceptions? self.exception = exception def __eq__(self, other): @@ -1054,6 +1042,34 @@ def get_symlink_target(self) -> str: self.exception = CapturedException(ex, level=10) +class Node: + """ + Factory class for creating a ``_TreeNode`` of a particular subclass. + Detects whether the path is a file or a directory or dataset, + and handles any exceptions (permission errors, broken symlinks, etc.) + """ + def __new__(cls, path: Path, depth: int, **kwargs): + node_cls = FileNode + captured_ex = None + try: + if path.is_dir(): + if is_dataset(path): + node_cls = DatasetNode + else: + node_cls = DirectoryNode + except NoDatasetFound as ex: # means 'is_dataset()' failed + # default to directory node + # just log the exception, do not set it as node attribute + CapturedException(ex, level=10) + node_cls = DirectoryNode + except Exception as ex: # means 'is_dir()' failed + # default to file node + # set exception as node attribute + captured_ex = CapturedException(ex, level=10) + + return node_cls(path, depth, exception=captured_ex, **kwargs) + + class DirectoryNode(_TreeNode): TYPE = "directory" @@ -1062,14 +1078,15 @@ def __init__(self, *args, **kwargs): try: # get first child if exists. this is a check for whether - # we can (potentially) recurse into the directory or + # we can potentially recurse into the directory or # if there are any filesystem issues (permissions errors, etc) - # TODO: replace with generic `Path.stat()`? (follows symlinks) any(self.path.iterdir()) except OSError as ex: # permission errors etc. are logged and stored as node # attribute so they can be passed to results dict. - # this will overwrite any exception passed to the constructor. + # this will overwrite any exception passed to the constructor, + # since we assume that this exception is closer to the root + # cause. self.exception = CapturedException(ex, level=10) # DEBUG level @@ -1134,20 +1151,3 @@ def calculate_dataset_depth(self): ds = superds return ds_depth, ds_absolute_depth - - -class DirectoryOrDatasetNode: - """Factory class for creating either a ``DirectoryNode`` or - ``DatasetNode``, based on whether the path is a dataset or not. - """ - def __new__(cls, path, *args, **kwargs): - try: - is_ds = is_dataset(path) # could fail because of permissions etc. 
- except Exception as ex: - # if dataset detection has failed, we fall back to a - # `DirectoryNode` with the exception stored as attribute - ce = CapturedException(ex, level=10) - return DirectoryNode(path, *args, exception=ce, **kwargs) - - node_cls = DatasetNode if is_ds else DirectoryNode - return node_cls(path, *args, **kwargs) From 46c08815e9e64f440bacc89074e5eed33b4a8fa9 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 13 Aug 2022 23:48:41 +0200 Subject: [PATCH 102/131] exclude_node_func() now accepts node object as arg instead of path --- datalad_next/tree.py | 84 ++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index fd56d14f..1402ca99 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -327,10 +327,10 @@ def build_excluded_node_func(include_hidden=False, include_files=False): and returns true if the node should *not* be displayed in the tree. """ - def is_excluded(path): + def is_excluded(node: _TreeNode): return any(( - not path.is_dir() if not include_files else False, - path.name.startswith(".") if not include_hidden else False + isinstance(node, FileNode) if not include_files else False, + node.path.name.startswith(".") if not include_hidden else False )) return is_excluded @@ -604,9 +604,11 @@ def __init__(self, for node_type in _TreeNode.__subclasses__()} @staticmethod - def default_exclude_func(path: Path): + def default_exclude_func(node): """By default, exclude files and hidden directories from the tree""" - return any((not path.is_dir(), path.name.startswith("."))) + return any( + (isinstance(node, FileNode), node.path.name.startswith(".")) + ) def path_depth(self, path: Path): """Calculate directory depth of a given path relative to the root of @@ -704,12 +706,12 @@ def _generate_tree_nodes(self, dir_path: Path): all_children = sorted(list(dir_path.iterdir())) child_depth = current_depth + 1 - # apply exclusion filters - children = ( - Node(p, child_depth) - for p in all_children - if not self.exclude_node_func(p) - ) + # generator to apply exclusion filter + def children(): + for child_path in all_children: + child_node = Node(child_path, child_depth) + if not self.exclude_node_func(child_node): + yield child_node # exclusion function could be expensive to compute, so we # use a generator for child nodes. however, we need to be able @@ -717,7 +719,7 @@ def _generate_tree_nodes(self, dir_path: Path): # displaying special end-of-subtree prefix). so we wrap the # generator in another 'lookahead' generator to detect the last # item. 
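
The 'lookahead' wrapper mentioned in the comment above is the module-level
``yield_with_last_item`` helper, whose body is not shown in this patch. A
minimal sketch of such a generator:

    def yield_with_last_item(generator):
        # buffer one item; once the generator is exhausted, the
        # buffered item is by definition the last one
        prev_val = next(generator, None)
        if prev_val is not None:
            for current_val in generator:
                yield False, prev_val
                prev_val = current_val
            yield True, prev_val

It yields ``(is_last_item, item)`` pairs, which the loop below unpacks into
``is_last_child, child``.
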
- for is_last_child, child in yield_with_last_item(children): + for is_last_child, child in yield_with_last_item(children()): if is_last_child: # last child of its subtree self.exhausted_levels.add(child_depth) @@ -801,7 +803,7 @@ def generate_nodes(self): Generator[_TreeNode] """ - def exclude_func(path: Path): + def exclude_func(node: _TreeNode): """Exclusion function -- here is the crux of the logic for pruning the main tree.""" @@ -811,9 +813,9 @@ def exclude_func(path: Path): self._next_ds.depth is None: # dummy depth self._advance_ds_generator() - if path.is_dir() and is_dataset(path): + if isinstance(node, DatasetNode): # check if maximum dataset depth is exceeded - is_valid_ds = self._is_valid_dataset(path) + is_valid_ds = self._is_valid_dataset(node) if is_valid_ds: self._advance_ds_generator() # go to next dataset(-parent) return not is_valid_ds @@ -823,15 +825,15 @@ def exclude_func(path: Path): # unless (in case of a directory) it is itself the parent of a # valid dataset. if it's a parent of a dataset, we don't apply # any filters -- it's just a means to get to the next dataset. - if not self._is_parent_of_ds(path): - return self.exclude_node_func(path) or \ - self._ds_child_node_exceeds_max_depth(path) + if not self._is_parent_of_ds(node): + return self.exclude_node_func(node) or \ + self._ds_child_node_exceeds_max_depth(node) except Exception as ex: CapturedException(ex, level=10) # DEBUG level - lgr.debug(f"Excluding path '{path}' from tree because " + lgr.debug(f"Excluding node from tree because " "an exception occurred while applying the " - "exclusion filter.") + f"exclusion filter: '{node.path}'") return True # exclude by default return False # do not exclude @@ -872,10 +874,10 @@ def _generate_datasets(self): Generator[DirectoryNode or DatasetNode] """ - def exclude(p: Path): + def exclude(n: _TreeNode): # we won't find any datasets underneath the git folder - return not p.is_dir() or \ - (p.is_dir() and p.name == ".git") + return isinstance(n, FileNode) or \ + (isinstance(n, DirectoryNode) and n.path.name == ".git") ds_tree = Tree( self.root, @@ -894,7 +896,7 @@ def exclude(p: Path): # yield the dataset itself if isinstance(node, DatasetNode) and \ node.ds_depth <= self.max_dataset_depth and \ - not self.exclude_node_func(node.path): + not self.exclude_node_func(node): # yield parent directories if not already done parents_below_root = node.parents[1:] # first parent is root @@ -907,39 +909,38 @@ def exclude(p: Path): visited.add(node.path) yield node - def _is_valid_dataset(self, path: Path): - return path.is_dir() and \ - is_path_relative_to(path, self.root) and \ - is_dataset(path) and \ - not self.exclude_node_func(path) and \ - not self._ds_exceeds_max_ds_depth(path) + def _is_valid_dataset(self, node): + return isinstance(node, DatasetNode) and \ + is_path_relative_to(node.path, self.root) and \ + not self.exclude_node_func(node) and \ + not self._ds_exceeds_max_ds_depth(node) - def _ds_exceeds_max_ds_depth(self, path: Path): - ds = DatasetNode(path, self.path_depth(path)) - return ds.ds_depth > self.max_dataset_depth + def _ds_exceeds_max_ds_depth(self, ds_node): + return ds_node.ds_depth > self.max_dataset_depth - def _ds_child_node_exceeds_max_depth(self, path: Path): - ds_parent = get_dataset_root_datalad_only(path) + def _ds_child_node_exceeds_max_depth(self, ds_node): + ds_parent = get_dataset_root_datalad_only(ds_node.path) if ds_parent is None: return True # it's not a dataset child, we exclude it - if not self._is_valid_dataset(ds_parent): + 
ds_parent_depth = self.path_depth(ds_parent) + if not self._is_valid_dataset(Node(ds_parent, ds_parent_depth)): return True # also exclude # check directory depth relative to the dataset parent - rel_depth = self.path_depth(path) - self.path_depth(ds_parent) + rel_depth = ds_node.depth - ds_parent_depth assert rel_depth >= 0, "relative depth from parent cannot be < 0 " \ - f"(path: '{path}', parent: '{ds_parent}')" + f"(path: '{ds_node.path}', parent: '{ds_parent}')" return rel_depth > self.max_depth - def _is_parent_of_ds(self, path: Path): - if not path.is_dir(): + def _is_parent_of_ds(self, node): + if isinstance(node, FileNode): return False # files can't be parents if self._next_ds is None: return False # no more datasets, can't be a parent - if self._next_ds.path == path: + if self._next_ds.path == node.path: # we hit a dataset or the parent of a dataset self._advance_ds_generator() return True @@ -1130,7 +1131,6 @@ def calculate_dataset_depth(self): ds_absolute_depth = 0 ds = self.ds - while ds: superds = get_superdataset(ds.pathobj) From ae574f6d062619be13540942b7b9015796a1d10d Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sun, 14 Aug 2022 00:54:41 +0200 Subject: [PATCH 103/131] move is_recursive_symlink() to _TreeNode method, extract path_depth() out of Tree class --- datalad_next/tree.py | 155 ++++++++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 68 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 1402ca99..20c12f12 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -383,6 +383,28 @@ def yield_with_last_item(generator): yield True, prev_val +def path_depth(path: Path, root: Path): + """Calculate directory depth of a path relative to the given root. + + Can also be a negative integer if the path is a parent of the + tree root. + + Returns + ------- + int + Number of levels of the given path *below* the root (positive + integer) or *above* the tree root (negative integer) + """ + if is_path_relative_to(path, root): + return len(path.relative_to(root).parts) + elif is_path_relative_to(root, path): + return - len(root.relative_to(path).parts) + else: + raise ValueError("Could not calculate directory depth: " + f"'{path}' is not relative to the tree root " + f"'{root}' (or vice-versa)") + + def is_empty_dir(path: Path): return path.is_dir() and not any(path.iterdir()) @@ -611,62 +633,7 @@ def default_exclude_func(node): ) def path_depth(self, path: Path): - """Calculate directory depth of a given path relative to the root of - the tree. - - Can also be a negative integer if the path is a parent of the - tree root. - - Returns - ------- - int - Number of levels of the given path *below* the tree root (positive - integer) or *above* the tree root (negative integer) - """ - if is_path_relative_to(path, self.root): - return len(path.relative_to(self.root).parts) - elif is_path_relative_to(self.root, path): - return - len(self.root.relative_to(path).parts) - else: - raise ValueError("Could not calculate directory depth: " - f"'{path}' is not relative to the tree root " - f"'{self.root}' (or vice-versa)") - - def is_recursive_symlink(self, dir_path: Path): - """Detect symlink pointing to a directory within the same tree - (directly or indirectly). - - The default behaviour is to follow symlinks. However, we do not follow - symlinks to directories that we may visit or have visited already, - i.e. are also located under the tree root or any parent of - the tree root (within a distance of ``max_depth``). 
- - Otherwise, the same subtree could be generated multiple times in - different places, potentially in a recursive loop (e.g. if the - symlink points to its parent). - - This is similar to the logic of the UNIX 'tree' command, but goes a - step further to prune all duplicate subtrees. - """ - if not dir_path.is_symlink(): - return False - - if not dir_path.is_dir(): - # we are only interested in symlinks pointing to a directory - raise ValueError("Path must be a directory") - - target_dir = dir_path.resolve() - - if is_path_relative_to(target_dir, self.root) or \ - is_path_relative_to(self.root, target_dir): - # either: - # - target dir is within `max_depth` levels beneath the tree - # root, so it will likely be yielded or has already been - # yielded (bar any exclusion filters) - # - target dir is a parent of the tree root, so we may still - # get into a loop if we recurse more than `max_depth` levels - return self.max_depth is None or \ - abs(self.path_depth(target_dir)) <= self.max_depth + return path_depth(path, self.root) def _generate_tree_nodes(self, dir_path: Path): """Recursively yield ``_TreeNode`` objects starting from ``dir_path`` @@ -686,7 +653,8 @@ def _generate_tree_nodes(self, dir_path: Path): if self.max_depth is None or \ current_depth < self.max_depth: - if self.is_recursive_symlink(dir_path): + if current_node.is_symlink() and \ + current_node.is_recursive_symlink(self.max_depth): # if symlink points to directory that we may visit or may # have visited already, do not recurse into it lgr.debug(f"Symlink is potentially recursive, " @@ -983,7 +951,7 @@ def tree_root(self) -> Path: else self.path # we are the root @property - def parents(self): + def parents(self) -> list[Path]: """List of parent paths in top-down order beginning from the tree root. Assumes the node path to be already normalized. @@ -1010,6 +978,17 @@ def is_symlink(self) -> bool: self.exception = CapturedException(ex, level=10) return False + def get_symlink_target(self) -> str: + """If node path is a symlink, get link target as string. Otherwise, + return None. Does not check that target path exists.""" + try: + if self.is_symlink(): + # use os.readlink() instead of Path.readlink() for + # Python <3.9 compatibility + return readlink(str(self.path)) + except Exception as ex: + self.exception = CapturedException(ex, level=10) + def is_broken_symlink(self) -> bool: """If node path is a symlink, check if it points to a nonexisting or inaccessible target or to itself (self-referencing link). Raise @@ -1031,16 +1010,56 @@ def is_broken_symlink(self) -> bool: self.exception = CapturedException(ex, level=10) return True - def get_symlink_target(self) -> str: - """If node path is a symlink, get link target as string. Otherwise, - return None. Does not check that target path exists.""" - try: - if self.is_symlink(): - # use os.readlink() instead of Path.readlink() for - # Python <3.9 compatibility - return readlink(str(self.path)) - except Exception as ex: - self.exception = CapturedException(ex, level=10) + def is_recursive_symlink(self, max_depth) -> bool: + """Detect symlink pointing to a directory within the same tree + (directly or indirectly). + + The default behaviour is to follow symlinks when traversing the tree. + However, we should not follow symlinks to directories that we may + visit or have visited already, i.e. are also located under the tree + root or any parent of the tree root (within a distance of + ``max_depth``). 
+ + Otherwise, the same subtree could be generated multiple times in + different places, potentially in a recursive loop (e.g. if the + symlink points to its parent). + + This is similar to the logic of the UNIX 'tree' command, but goes a + step further to prune all duplicate subtrees. + + Parameters + ---------- + max_depth + Max depth of the ``Tree`` to which this node belongs + """ + if not self.is_symlink(): + raise ValueError("Node path is not a symlink, cannot check if " + f"symlink is recursive: {self.path}") + + if isinstance(self, FileNode): + # we are only interested in symlinks pointing to a directory + return False + + if self.is_broken_symlink(): + # cannot identify target, no way to know if link is recursive + return False + + target_dir = self.path.resolve() + tree_root = self.tree_root + + if is_path_relative_to(target_dir, tree_root) or \ + is_path_relative_to(tree_root, target_dir): + # either: + # - target dir is within `max_depth` levels beneath the tree + # root, so it will likely be yielded or has already been + # yielded (bar any exclusion filters) + # - target dir is a parent of the tree root, so we may still + # get into a loop if we recurse more than `max_depth` levels + relative_depth = abs(path_depth(target_dir, tree_root)) + return max_depth is None or \ + relative_depth <= max_depth + else: + return False class Node: From e4d041d91cee18d699d98e2e529444ea27edc4cd Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Aug 2022 12:57:57 +0200 Subject: [PATCH 104/131] Cannot use too-modern type annotation (yet) --- datalad_next/tree.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 20c12f12..a8439669 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -951,7 +951,9 @@ def tree_root(self) -> Path: else self.path # we are the root @property - def parents(self) -> list[Path]: + # More accurate annotation only from PY3.9 onwards + # def parents(self) -> list[Path]: + def parents(self) -> list: """List of parent paths in top-down order beginning from the tree root. Assumes the node path to be already normalized. From d4664fdf71b99b03a303c0261773b8301f11888c Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Aug 2022 13:08:07 +0200 Subject: [PATCH 105/131] Configure ReadTheDocs to use PY3.9 to handle the type annotations --- readthedocs.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 readthedocs.yml diff --git a/readthedocs.yml b/readthedocs.yml new file mode 100644 index 00000000..c90e6e0f --- /dev/null +++ b/readthedocs.yml @@ -0,0 +1,26 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-20.04 + tools: + python: "3.9" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/source/conf.py + + +formats: all + +# Optionally declare the Python requirements required to build your docs +python: + install: + - path: . + method: pip + - requirements: requirements-devel.txt From 31cdba957528a77dde058be57c080a44d67564d7 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 15 Aug 2022 13:54:24 +0200 Subject: [PATCH 106/131] Adjust test skip condition to match problem These tests need symlinks to work, which could also be unavailable on non-Windows platforms with particular filesystems. 
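For reference, symlink capability is typically probed by attempting to create a link in a scratch directory. A rough sketch of such a check (hypothetical helper; the actual ``skip_wo_symlink_capability`` decorator applied in the diff below ships with DataLad's test utilities)::

    import tempfile
    from pathlib import Path

    def can_create_symlinks() -> bool:
        # failure to create a single test link indicates a filesystem
        # or privilege level without symlink support
        with tempfile.TemporaryDirectory() as tmp:
            target = Path(tmp) / "target"
            target.touch()
            try:
                (Path(tmp) / "link").symlink_to(target)
                return True
            except OSError:
                return False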
--- datalad_next/tests/test_tree.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 617a72a3..03a653cd 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -715,6 +715,7 @@ def test_print_tree_fails_for_nonexistent_directory(self): Tree(Path(nonexistent_dir), max_depth=1) @skip_if_on_windows + @skip_wo_symlink_capability def test_print_tree_permission_denied(self, path): """ - If the tree contains a directory for which the user has no @@ -805,6 +806,7 @@ def test_tree_with_broken_symlinks(self, path, include_files): assert set(expected) == set(actual) @skip_if_on_windows + @skip_wo_symlink_capability @pytest.mark.parametrize("include_files", (True, False)) def test_tree_with_broken_symlinks_to_inaccessible_targets( self, path, include_files): From 7a66ba6fa8ccd908944f4ecbedeef3e589123f47 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:35:45 +0200 Subject: [PATCH 107/131] disable results rendering for ds.create() calls in test setup --- datalad_next/tests/test_tree.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 03a653cd..2cbccb9a 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -177,17 +177,19 @@ def path_ds(): # create datasets / repos root = temp_dir_root / "root" BasicGitTestRepo(path=root / "repo0", puke_if_exists=False) - superds0 = Dataset(root / "superds0").create(force=True) - sd0_subds0 = superds0.create("sd0_subds0", force=True) - sd0_subds0.create("sd0_sub0_subds0", force=True) - superds1 = Dataset(root / "superds1").create(force=True) - superds1.create(Path("sd1_dir0") / "sd1_d0_subds0", force=True) - Dataset(root / "superds1" / "sd1_ds0").create(force=True) + ckwa = dict(force=True, result_renderer="disabled") + superds0 = Dataset(root / "superds0").create(**ckwa) + sd0_subds0 = superds0.create("sd0_subds0", **ckwa) + sd0_subds0.create("sd0_sub0_subds0", **ckwa) + superds1 = Dataset(root / "superds1").create(**ckwa) + superds1.create(Path("sd1_dir0") / "sd1_d0_subds0", **ckwa) + Dataset(root / "superds1" / "sd1_ds0").create(**ckwa) BasicGitTestRepo( path=root / "superds1" / "sd1_dir0" / "sd1_d0_repo0", puke_if_exists=False) - sd1_subds0 = superds1.create("sd1_subds0", force=True) - sd1_subds0.drop(what='all', reckless='kill', recursive=True) + sd1_subds0 = superds1.create("sd1_subds0", **ckwa) + sd1_subds0.drop(what='all', reckless='kill', + recursive=True, result_renderer='disabled') yield temp_dir_root From 990af0ccd3504b43ec3afafb8755a1dfa7f8f9bb Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:39:27 +0200 Subject: [PATCH 108/131] remove redundant calls to path.relative_to() in path_depth() (suggestion by @mih) --- datalad_next/tree.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index a8439669..6d010a0f 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -394,15 +394,26 @@ def path_depth(path: Path, root: Path): int Number of levels of the given path *below* the root (positive integer) or *above* the tree root (negative integer) + + Raises + ------ + ValueError + Like ``path.relative_to()``, raises ``ValueError`` if the path is not + relative to the root """ - if is_path_relative_to(path, root): - return len(path.relative_to(root).parts) - elif 
is_path_relative_to(root, path): - return - len(root.relative_to(path).parts) - else: - raise ValueError("Could not calculate directory depth: " - f"'{path}' is not relative to the tree root " - f"'{root}' (or vice-versa)") + sign = 1 + try: + rpath = path.relative_to(root) + except ValueError: + try: + rpath = root.relative_to(path) + sign = -1 + except ValueError: + raise ValueError( + "Could not calculate directory depth: " + f"'{path}' is not relative to the tree root " + f"'{root}' (or vice-versa)") + return sign * len(rpath.parts) def is_empty_dir(path: Path): From a551e25c8d0729f8e89fdb16d80c6b4fb3877375 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:43:07 +0200 Subject: [PATCH 109/131] remove unnecessary calls to is_path_relative_to() to improve performance --- datalad_next/tree.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 6d010a0f..7f909a71 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -505,13 +505,6 @@ def res_filter(res): ) -def is_subds_of_parent(subds_path: Path, parent_path: Path): - return any( - is_path_relative_to(Path(p), subds_path) - for p in get_subds_paths(parent_path) - ) - - def get_dataset_root_datalad_only(path: Path): """Get root of dataset containing a given path (datalad datasets only, not pure git/git-annex repo) @@ -568,7 +561,7 @@ def get_superdataset(path: Path): superds = Dataset(sds_path_) # test if path is registered subdataset of the parent - if not is_subds_of_parent(path, superds.pathobj): + if not str(path) in get_subds_paths(superds.pathobj): break # That was a good candidate @@ -1060,18 +1053,19 @@ def is_recursive_symlink(self, max_depth) -> bool: target_dir = self.path.resolve() tree_root = self.tree_root - if is_path_relative_to(target_dir, tree_root) or \ - is_path_relative_to(tree_root, target_dir): - # either: - # - target dir is within `max_depth` levels beneath the tree - # root, so it will likely be yielded or has already been - # yielded (bar any exclusion filters) - # - target dir is a parent of the tree root, so we may still - # get into a loop if we recurse more than `max_depth` levels - relative_depth = abs(path_depth(target_dir, tree_root)) + # either: + # - target dir is within `max_depth` levels beneath the tree + # root, so it will likely be yielded or has already been + # yielded (bar any exclusion filters) + # - target dir is a parent of the tree root, so we may still + # get into a loop if we recurse more than `max_depth` levels + try: + rel_depth = abs(path_depth(target_dir, tree_root)) return max_depth is None or \ - relative_depth <= max_depth - else: + rel_depth <= max_depth + except ValueError: + # cannot compute path depth because target is outside + # of the tree root, so no loop is possible return False From 37281a38a249c0a9fbf1d42a7ef62f1186fa5329 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:46:13 +0200 Subject: [PATCH 110/131] get parent ds from stored visited nodes in _ds_child_node_exceeds_max_depth() to avoid recreating node objects --- datalad_next/tree.py | 52 ++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 7f909a71..a6a5c925 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -752,6 +752,8 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): # secondary 'helper' generator that will traverse the whole 
tree # (once) and yield only datasets and their parents directories self._ds_generator = self._generate_datasets() + # keep track of node paths that have been yielded + self._visited = set([]) # current value of the ds_generator. the generator will be initialized # lazily, so for now we set the value to a dummy `_TreeNode` @@ -787,7 +789,8 @@ def exclude_func(node: _TreeNode): if isinstance(node, DatasetNode): # check if maximum dataset depth is exceeded - is_valid_ds = self._is_valid_dataset(node) + is_valid_ds = not self.exclude_node_func(node) and \ + node.ds_depth <= self.max_dataset_depth if is_valid_ds: self._advance_ds_generator() # go to next dataset(-parent) return not is_valid_ds @@ -857,9 +860,6 @@ def exclude(n: _TreeNode): exclude_node_func=exclude, ) - # keep track of node paths that have already been yielded - visited = set([]) - nodes_below_root = ds_tree.generate_nodes() next(nodes_below_root) # skip root node @@ -872,43 +872,37 @@ def exclude(n: _TreeNode): # yield parent directories if not already done parents_below_root = node.parents[1:] # first parent is root - for depth, parent in enumerate(parents_below_root): - if parent not in visited: - visited.add(parent) + for par_depth, par_path in enumerate(parents_below_root): + parent = Node(par_path, par_depth) - yield Node(parent, depth) + if parent not in self._visited: + self._visited.add(parent) + yield parent - visited.add(node.path) + self._visited.add(node) yield node - def _is_valid_dataset(self, node): - return isinstance(node, DatasetNode) and \ - is_path_relative_to(node.path, self.root) and \ - not self.exclude_node_func(node) and \ - not self._ds_exceeds_max_ds_depth(node) - - def _ds_exceeds_max_ds_depth(self, ds_node): - return ds_node.ds_depth > self.max_dataset_depth - def _ds_child_node_exceeds_max_depth(self, ds_node): - ds_parent = get_dataset_root_datalad_only(ds_node.path) - if ds_parent is None: - return True # it's not a dataset child, we exclude it + ds_parent_path = get_dataset_root_datalad_only(ds_node.path) + if ds_parent_path is None: + # it's not a dataset's child, so exclude + return True - ds_parent_depth = self.path_depth(ds_parent) - if not self._is_valid_dataset(Node(ds_parent, ds_parent_depth)): - return True # also exclude + if ds_parent_path == self.root: + ds_parent_depth = 0 + else: + ds_parent = next((node for node in self._visited + if node.path == ds_parent_path), None) + if ds_parent is None: + # parent is not part of the tree, so exclude child + return True + ds_parent_depth = ds_parent.depth # check directory depth relative to the dataset parent rel_depth = ds_node.depth - ds_parent_depth - assert rel_depth >= 0, "relative depth from parent cannot be < 0 " \ - f"(path: '{ds_node.path}', parent: '{ds_parent}')" return rel_depth > self.max_depth def _is_parent_of_ds(self, node): - if isinstance(node, FileNode): - return False # files can't be parents - if self._next_ds is None: return False # no more datasets, can't be a parent From 445fb221f7fa2e31b2bcfea03912e0af3b2c021a Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:47:35 +0200 Subject: [PATCH 111/131] remove is_dir() check to limit system calls (expensive on huge directory trees) --- datalad_next/tree.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index a6a5c925..02a09590 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -417,7 +417,9 @@ def path_depth(path: Path, root: Path): def is_empty_dir(path: Path): - return 
path.is_dir() and not any(path.iterdir()) + """Does not check that path is a directory (to avoid extra + system calls)""" + return not any(path.iterdir()) @lru_cache From 7b1f5b1042f807686a37f0d850f8b480b385f99e Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:48:35 +0200 Subject: [PATCH 112/131] add parameter 'installed_only' to is_dataset() for skipping non-installed ds check when not needed --- datalad_next/tree.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 02a09590..efc77acf 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -423,17 +423,16 @@ def is_empty_dir(path: Path): @lru_cache -def is_dataset(path: Path): +def is_dataset(path: Path, installed_only=False): """Fast dataset detection. Infer that a directory is a dataset if it is either: - installed, or - - not installed, but has an installed superdatset. + - not installed, but has an installed superdatset (only if argument + ``installed_only`` is False) - Only consider datalad datasets, not plain git/git-annex repos. Datasets - used for aggregating metadatata from subdatasets are also counted as - datasets, although they do not have a dataset ID themselves. + Only consider datalad datasets, not plain git/git-annex repos. Symlinks pointing to datasets are not resolved, so will always return False for symlinks. This prevents potentially detecting duplicate datasets @@ -442,13 +441,13 @@ def is_dataset(path: Path): Results are cached because the check is somewhat expensive and may be run multiple times on the same path. - TODO: is there a way to detect a datalad dataset if it is not installed - and it is not a subdataset? - Parameters ---------- path: Path Path to directory to be identified as dataset or non-dataset + + installed_only: bool + Whether to ignore datasets that are not installed """ try: if path.is_symlink(): @@ -468,9 +467,9 @@ def is_dataset(path: Path): # instead of querying ds.is_installed() (which checks if the # directory has the .git folder), we check if the directory # is empty (faster) -- as e.g. after a non-recursive `datalad clone` - if is_empty_dir(path): - if get_superdataset(path) is not None: - return True + if not installed_only: + if is_empty_dir(path): + return get_superdataset(path) is not None except Exception as ex: # if anything fails (e.g. 
permission denied), we raise exception @@ -528,7 +527,7 @@ def get_dataset_root_datalad_only(path: Path): return None # we are not inside a dataset potential_ds_root = Path(potential_ds_root) - if is_dataset(potential_ds_root): + if is_dataset(potential_ds_root, installed_only=True): return potential_ds_root # it's a match # we go one directory higher and try again From af014207a2e2011e8e1dbe869dd359582d5a771b Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:50:26 +0200 Subject: [PATCH 113/131] cache results of get_dataset_root_datalad_only() (suggested by @mih) --- datalad_next/tree.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index efc77acf..b53840af 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -506,6 +506,7 @@ def res_filter(res): ) +@lru_cache def get_dataset_root_datalad_only(path: Path): """Get root of dataset containing a given path (datalad datasets only, not pure git/git-annex repo) From 2311a1ea572ae655cf3c236242d0c498ea848832 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:51:33 +0200 Subject: [PATCH 114/131] skip unnecessary input validation in Node() constructor --- datalad_next/tree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index b53840af..5ed38b7c 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -1072,6 +1072,9 @@ class Node: and handles any exceptions (permission errors, broken symlinks, etc.) """ def __new__(cls, path: Path, depth: int, **kwargs): + if not isinstance(path, Path): + raise ValueError("path must be a Path object") + node_cls = FileNode captured_ex = None try: From 070953dba2f7478c17c922a2d530e38512a6af48 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Tue, 16 Aug 2022 01:53:44 +0200 Subject: [PATCH 115/131] do not count datasets that are only metadata aggregators (because of legacy config system) (suggested by @mih) - reverts 6a047f --- datalad_next/tree.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 5ed38b7c..3183dd84 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -457,8 +457,7 @@ def is_dataset(path: Path, installed_only=False): f"dataset: '{path}'") return False - if (path / ".datalad" / "config").is_file() or \ - (path / ".datalad" / "metadata").is_dir(): + if (path / ".datalad" / "config").is_file(): # could also query `ds.id`, but checking just for existence # of config file is quicker. return True From 744caaae15500a88ef9af0b1ec5dfbc93e39abea Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 17 Aug 2022 00:01:40 +0200 Subject: [PATCH 116/131] update tree command docs --- datalad_next/tree.py | 163 ++++++++++++++++++++++++++++++++----------- 1 file changed, 124 insertions(+), 39 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 3183dd84..f66c1eb0 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -51,38 +51,117 @@ class TreeCommand(Interface): """Visualize directory and dataset hierarchies - This command mimics the UNIX/MSDOS ``tree`` utility to display a directory - tree, highlighting DataLad datasets in the hierarchy. + This command mimics the UNIX/MSDOS 'tree' utility to generate and + display a directory tree, with DataLad-specific enhancements. - Two main use cases are covered: + It can serve the following purposes: - 1. Glorified ``tree`` command: + 1. Glorified 'tree' command + 2. Dataset discovery + 3. 
Programmatic directory traversal - Display the contents of a directory tree and see which directories are - datalad datasets (including subdatasets that are present but not - installed, such as after a non-recursive clone). + *Glorified 'tree' command* - This is basically just ``tree`` with visual markers for datasets. In - addition to it, ``datalad-tree`` provides the following: + The rendered command output uses 'tree'-style visualization:: - - The subdataset hierarchy level is shown in the dataset marker - (e.g. [DS~2]). This is the absolute level, meaning it may also take - into account superdatasets located above the tree root and thus - not included in the output. - - The 'report line' at the bottom of the output shows the count of - displayed datasets, in addition to the count of directories and - files. + /tmp/mydir + ├── [DS~0] ds_A/ + │ └── [DS~1] subds_A/ + └── [DS~0] ds_B/ + ├── dir_B/ + │ ├── file.txt + │ ├── subdir_B/ + │ └── [DS~1] subds_B0/ + └── [DS~1] (not installed) subds_B1/ - 2. Descriptor of nested subdataset hierarchies: + 5 datasets, 2 directories, 1 file - Display the structure of multiple datasets and their hierarchies based - on subdataset nesting level, regardless of their location in the - directory tree. + Dataset paths are prefixed by a marker indicating subdataset hierarchy + level, like ``[DS~1]``. + This is the absolute subdataset level, meaning it may also take into + account superdatasets located above the tree root and thus not included + in the output. + If a subdataset is registered but not installed (such as after a + non-recursive ``datalad clone``), it will be prefixed by ``(not + installed)``. Only DataLad datasets are considered, not pure + git/git-annex repositories. + + The 'report line' at the bottom of the output shows the count of + displayed datasets, in addition to the count of directories and + files. In this context, datasets and directories are mutually + exclusive categories. + + By default, only directories (no files) are included in the tree, + and hidden directories are skipped. Both behaviours can be changed + using command options. + + Symbolic links are always followed. + This means that a symlink pointing to a directory is traversed and + counted as a directory (unless it potentially creates a loop in + the tree). + + *Dataset discovery* + + Using the [CMD: ``--dataset-depth`` CMD][PY: ``dataset_depth`` PY] + option, this command generates the layout of dataset hierarchies based on + subdataset nesting level, regardless of their location in the + filesystem. + + In this case, tree depth is determined by subdataset depth. This mode + is therefore suited for discovering available datasets when their + location is not known in advance. + + By default, only datasets are listed, without their contents. If + [CMD: ``--depth`` CMD][PY: ``depth`` PY] is specified additionally, + the contents of each dataset will be included up to [CMD: + ``--depth`` CMD][PY: ``depth`` PY] directory levels. + + Tree filtering options such as [CMD: ``--include-hidden`` CMD][PY: + ``include_hidden`` PY] only affect which directories are + reported/displayed, not which directories are traversed to find datasets. + + *Programmatic directory traversal* + + The command yields a result record for each tree node (dataset, + directory or file). 
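A minimal consumption sketch (assuming the extension is installed, so that ``tree`` is exposed via ``datalad.api``)::

    from datalad.api import tree

    for res in tree('/tmp/some/dir', depth=2,
                    result_renderer='disabled',
                    return_type='generator'):
        print(res['depth'], res['type'], res['path'])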
The following properties are reported, where available: + + "path" + Absolute path of the tree node + + "type" + Type of tree node: "dataset", "directory" or "file" + + "depth" + Directory depth of node relative to the tree root + + "exhausted_levels" + Depth levels for which no nodes are left to be generated (the + respective subtrees have been 'exhausted') + + "count" + Dict with cumulative counts of datasets, directories and files in the + tree up until the current node. File count is only included if the + command is run with the [CMD: ``--include-files`` CMD][PY: + ``include_files`` PY] + option. + + "dataset_depth" + Subdataset depth level relative to the tree root. Only included for + node type "dataset". + + "dataset_abs_depth" + Absolute subdataset depth level. Only included for node type "dataset". + + "dataset_is_installed" + Whether the registered subdataset is installed. Only included for node + type "dataset". + + "symlink_target" + If the tree node is a symlink, the path to the link target + + "is_broken_symlink" + If the tree node is a symlink, whether it is a broken symlink - In this case, the tree depth is determined by subdataset depth. - There is also the option to display contents (directories/files) of - each dataset up to max_depth levels, to provide better context around - the datasets. """ result_renderer = 'tailored' @@ -98,13 +177,15 @@ class TreeCommand(Interface): args=("--depth",), doc="""maximum level of directory tree to display. If not specified, will display all levels. - If paired with [CMD: --dataset-depth CMD][PY: dataset_depth PY], - refers to the maximum directory level to display underneath each - dataset.""", + If paired with [CMD: ``--dataset-depth`` CMD][PY: + ``dataset_depth`` PY], refers to the maximum directory level to + display underneath each dataset.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), dataset_depth=Parameter( args=("--dataset-depth",), - doc="""maximum level of nested subdatasets to display""", + doc="""maximum level of nested subdatasets to display. 0 means + only top-level datasets, 1 means top-level datasets and their + immediate subdatasets, etc.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), include_files=Parameter( args=("--include-files",), @@ -112,22 +193,26 @@ class TreeCommand(Interface): action='store_true'), include_hidden=Parameter( args=("--include-hidden",), - doc="""include hidden files/directories in output display""", + doc="""include hidden files/directories in output display. This + option does not affect which directories will be searched for + datasets when specifying [CMD: ``--dataset-depth`` CMD][PY: + ``dataset_depth`` PY]. 
For example, datasets located underneath + the hidden folder `.datalad` will be reported even if [CMD: + ``--include-hidden`` CMD][PY: ``include_hidden`` PY] is omitted.""", action='store_true'), ) _examples_ = [ - dict(text="Display up to 3 levels of the current directory's " - "subdirectories and their contents", - code_py="tree(depth=3, include_files=True)", - code_cmd="datalad tree --depth 3 --include-files"), - dict(text="Display all first- and second-level subdatasets of " - "datasets located anywhere under /tmp (including in hidden " - "directories) regardless of directory depth", - code_py="tree('/tmp', dataset_depth=2, include_hidden=True)", - code_cmd="datalad tree /tmp --dataset-depth 2 --include-hidden"), + dict(text="Display up to 3 levels of subdirectories below the current " + "directory, including files and hidden contents", + code_py="tree(depth=3, include_files=True, include_hidden=True)", + code_cmd="datalad tree --depth 3 --include-files --include-hidden"), + dict(text="Find all top-level datasets located anywhere under ``/tmp``", + code_py="tree('/tmp', dataset_depth=0)", + code_cmd="datalad tree /tmp --dataset-depth 0"), dict(text="Display first- and second-level subdatasets and their " - "contents, up to 1 directory deep within each dataset", + "directory contents, up to 1 subdirectory deep within each " + "dataset", code_py="tree(dataset_depth=2, depth=1)", code_cmd="datalad tree --dataset-depth 2 --depth 1"), ] From e32cf5843990769b49d0b1715cbe9c0fce82c4f5 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 17 Aug 2022 00:14:58 +0200 Subject: [PATCH 117/131] reword args descriptions in tree command docs --- datalad_next/tree.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index f66c1eb0..4ab57474 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -175,25 +175,26 @@ class TreeCommand(Interface): constraints=EnsureStr() | EnsureNone()), depth=Parameter( args=("--depth",), - doc="""maximum level of directory tree to display. - If not specified, will display all levels. + doc="""maximum level of subdirectories to include in the tree. + If not specified, will generate the full tree with no depth + constraint. If paired with [CMD: ``--dataset-depth`` CMD][PY: ``dataset_depth`` PY], refers to the maximum directory level to - display underneath each dataset.""", + generate underneath each dataset.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), dataset_depth=Parameter( args=("--dataset-depth",), - doc="""maximum level of nested subdatasets to display. 0 means - only top-level datasets, 1 means top-level datasets and their - immediate subdatasets, etc.""", + doc="""maximum level of nested subdatasets to include in the + tree. 0 means only top-level datasets, 1 means top-level + datasets and their immediate subdatasets, etc.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), include_files=Parameter( args=("--include-files",), - doc="""include files in output display""", + doc="""include files in the tree""", action='store_true'), include_hidden=Parameter( args=("--include-hidden",), - doc="""include hidden files/directories in output display. This + doc="""include hidden files/directories in the tree. This option does not affect which directories will be searched for datasets when specifying [CMD: ``--dataset-depth`` CMD][PY: ``dataset_depth`` PY]. 
For example, datasets located underneath @@ -203,14 +204,14 @@ class TreeCommand(Interface): ) _examples_ = [ - dict(text="Display up to 3 levels of subdirectories below the current " + dict(text="Show up to 3 levels of subdirectories below the current " "directory, including files and hidden contents", code_py="tree(depth=3, include_files=True, include_hidden=True)", code_cmd="datalad tree --depth 3 --include-files --include-hidden"), dict(text="Find all top-level datasets located anywhere under ``/tmp``", code_py="tree('/tmp', dataset_depth=0)", code_cmd="datalad tree /tmp --dataset-depth 0"), - dict(text="Display first- and second-level subdatasets and their " + dict(text="Report first- and second-level subdatasets and their " "directory contents, up to 1 subdirectory deep within each " "dataset", code_py="tree(dataset_depth=2, depth=1)", From 8510dd678b911282f40d7a0406a24c2637a2348b Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Wed, 17 Aug 2022 20:09:48 +0200 Subject: [PATCH 118/131] fix lru_cache decorator syntax for python 3.7 compatibility --- datalad_next/tree.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 4ab57474..ad16251a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -508,7 +508,7 @@ def is_empty_dir(path: Path): return not any(path.iterdir()) -@lru_cache +@lru_cache() def is_dataset(path: Path, installed_only=False): """Fast dataset detection. @@ -566,7 +566,7 @@ def is_dataset(path: Path, installed_only=False): return False -@lru_cache +@lru_cache() def get_subds_paths(ds_path: Path): """Return paths of immediate subdatasets for a given dataset path.""" # This is an expensive operation because it calls git to read the @@ -591,7 +591,7 @@ def res_filter(res): ) -@lru_cache +@lru_cache() def get_dataset_root_datalad_only(path: Path): """Get root of dataset containing a given path (datalad datasets only, not pure git/git-annex repo) @@ -621,7 +621,7 @@ def get_dataset_root_datalad_only(path: Path): return ds_root -@lru_cache +@lru_cache() def get_superdataset(path: Path): """Reimplementation of ``Dataset.get_superdataset()`` to allow caching results of `ds.subdatasets()` (the most expensive operation). 
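The parentheses matter here because the bare decorator form of ``lru_cache`` was only introduced in Python 3.8; a small sketch of the difference::

    from functools import lru_cache

    @lru_cache()   # call form: works on Python 3.7 and later
    def cached_ok(x):
        return x

    @lru_cache     # bare form: raises TypeError on Python 3.7
    def cached_broken(x):
        return x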
@@ -1222,7 +1222,7 @@ def __init__(self, *args, **kwargs): # only if exception has not already been passed to constructor self.exception = CapturedException(ex, level=10) - @lru_cache + @lru_cache() def calculate_dataset_depth(self): """ Calculate 2 measures of a dataset's nesting depth/level: From c64ff435e383bf70121e99b04abc240b6778debe Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 13:12:25 +0200 Subject: [PATCH 119/131] add test for dataset tree with resulting directory depth that exceeds the depth of deepest dataset --- datalad_next/tests/test_tree.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 2cbccb9a..662dddd5 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -630,6 +630,18 @@ class TestDatasetTree(TestTree): └── [DS~0] superds1/ ├── sd1_dir0/ └── [DS~0] sd1_ds0/ +""" + }, + { + "dataset_depth": 0, + "depth": 2, + "expected_stats_str": "3 datasets, 2 directories", + "expected_str": """ +├── [DS~0] superds0/ +└── [DS~0] superds1/ + ├── sd1_dir0/ + │ └── sd1_d0_repo0/ + └── [DS~0] sd1_ds0/ """ }, { From cebaf6081c16bfa45990ad6e6b2e55c99f0798e0 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 13:12:46 +0200 Subject: [PATCH 120/131] add test for dataset tree when there are no datasets --- datalad_next/tests/test_tree.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 662dddd5..12fc5407 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -702,6 +702,24 @@ def test_print_tree( ui.message(actual_res) assert_str_equal(expected_res, actual_res) + def test_print_tree_without_datasets(self): + """If there are no datasets, should only print the root""" + root = str(self.path / "root" / "repo0") + command = [ + 'tree', + root, + '--depth', '10', + '--dataset-depth', '10', + '--include-files' + ] + _, actual_res, _ = get_tree_rendered_output(command) + expected_res = "" + ui.message("expected:") + ui.message(expected_res) + ui.message("actual:") + ui.message(actual_res) + assert_str_equal(expected_res, actual_res) + def test_print_stats( self, dataset_depth, depth, expected_stats_str ): From 26a35cfac8a8cf881192365b38d8e9a31bd1b534 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 13:14:14 +0200 Subject: [PATCH 121/131] fix formatting of multiple imports --- datalad_next/tree.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index ad16251a..72e93a5d 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -11,7 +11,10 @@ __docformat__ = "numpy" import logging -from functools import wraps, lru_cache +from functools import ( + wraps, + lru_cache +) from os import readlink from pathlib import Path From b3d1ac37c5d16a4b82513606dbfb82298982a7a9 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 13:29:05 +0200 Subject: [PATCH 122/131] add note on performance of --dataset-depth option --- datalad_next/tree.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 72e93a5d..7bad3f6a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -123,6 +123,13 @@ class TreeCommand(Interface): ``include_hidden`` PY] only affect which directories are reported/displayed, not which directories are traversed to find datasets. 
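The per-directory dataset check itself stays cheap: as established earlier in this series, it reduces in essence to a single file-existence probe, roughly (sketch with a hypothetical helper name)::

    from pathlib import Path

    def looks_like_installed_dataset(path: Path) -> bool:
        # a DataLad dataset keeps its configuration (and ID) here;
        # probing for the file avoids instantiating a Dataset object
        return (path / ".datalad" / "config").is_file()

The cost of dataset discovery is therefore dominated by the traversal itself.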
+ **Performance note**: since no assumption is made on the location of + datasets, running this command with the [CMD: ``--dataset-depth`` CMD][PY: + ``dataset_depth`` PY] option does a full scan of the whole directory + tree. As such, it can be significantly slower than a call with an + equivalent output that uses [CMD: ``--depth`` CMD][PY: ``depth`` PY] to + limit the tree instead. + *Programmatic directory traversal* The command yields a result record for each tree node (dataset, @@ -189,7 +196,8 @@ class TreeCommand(Interface): args=("--dataset-depth",), doc="""maximum level of nested subdatasets to include in the tree. 0 means only top-level datasets, 1 means top-level - datasets and their immediate subdatasets, etc.""", + datasets and their immediate subdatasets, etc. *Note*: may be + slow on large directory trees.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), include_files=Parameter( args=("--include-files",), From bea50e3271cd472c1f6a8a2c2ff1aeb746e86e54 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 14:03:11 +0200 Subject: [PATCH 123/131] reword docstrings / log messages --- datalad_next/tree.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 7bad3f6a..8b3cec2a 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -117,11 +117,13 @@ class TreeCommand(Interface): By default, only datasets are listed, without their contents. If [CMD: ``--depth`` CMD][PY: ``depth`` PY] is specified additionally, the contents of each dataset will be included up to [CMD: - ``--depth`` CMD][PY: ``depth`` PY] directory levels. + ``--depth`` CMD][PY: ``depth`` PY] directory levels (excluding + subdirectories that are themselves datasets). Tree filtering options such as [CMD: ``--include-hidden`` CMD][PY: ``include_hidden`` PY] only affect which directories are - reported/displayed, not which directories are traversed to find datasets. + reported as dataset contents, not which directories are traversed to find + datasets. **Performance note**: since no assumption is made on the location of datasets, running this command with the [CMD: ``--dataset-depth`` CMD][PY: @@ -188,9 +190,10 @@ class TreeCommand(Interface): doc="""maximum level of subdirectories to include in the tree. If not specified, will generate the full tree with no depth constraint. - If paired with [CMD: ``--dataset-depth`` CMD][PY: - ``dataset_depth`` PY], refers to the maximum directory level to - generate underneath each dataset.""", + If paired with + [CMD: ``--dataset-depth`` CMD][PY: ``dataset_depth`` PY], + refers to the maximum directory level to generate underneath + each dataset.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), dataset_depth=Parameter( args=("--dataset-depth",), @@ -311,11 +314,13 @@ def __call__( @staticmethod def custom_result_renderer(res, **kwargs): """ - Each node is printed on one line. 
The string uses the format: - ``[] [] [] [] [] + + Example line:: + + │ │ ├── path_dir_level3 """ from datalad.support import ansi_colors @@ -759,8 +764,9 @@ def _generate_tree_nodes(self, dir_path: Path): current_node.is_recursive_symlink(self.max_depth): # if symlink points to directory that we may visit or may # have visited already, do not recurse into it - lgr.debug(f"Symlink is potentially recursive, " - f"will not traverse target directory: '{dir_path}'") + lgr.debug("Symlink is potentially recursive, " + "will not traverse target directory: " + f"{dir_path} -> {current_node.get_symlink_target()}") return if current_node.exception is not None: From 0b7808d503339e58e09485b60e31b512150ae211 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 14:08:17 +0200 Subject: [PATCH 124/131] cast Tree root to Path object in constructor --- datalad_next/tree.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 8b3cec2a..8b2b5677 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -704,8 +704,9 @@ def __init__(self, exclude_node_func: Callable or None Function to filter out tree nodes from the tree """ - self.root = root.resolve(strict=False) try: + root = Path(root) + self.root = root.resolve(strict=False) assert self.root.is_dir(), f"path is not a directory: {self.root}" except (AssertionError, OSError) as ex: # could be permission error raise ValueError(f"directory not found: '{root}'") from ex From ca689beb1e614e4a0413f7af4050d501c84bd837 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 14:09:17 +0200 Subject: [PATCH 125/131] add '__repr__' methods to classes --- datalad_next/tree.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 8b2b5677..42b89a81 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -733,6 +733,10 @@ def __init__(self, self.node_count = {node_type.__name__: 0 for node_type in _TreeNode.__subclasses__()} + def __repr__(self): + return self.__class__.__name__ + \ + f"('{self.root}', max_depth={self.max_depth})" + @staticmethod def default_exclude_func(node): """By default, exclude files and hidden directories from the tree""" @@ -1046,6 +1050,9 @@ def __eq__(self, other): def __hash__(self): return hash(str(self.path)) + def __repr__(self): + return f"{self.__class__.__name__}('{self.path}', depth={self.depth})" + @property def tree_root(self) -> Path: """Calculate tree root path from node path and depth""" From d290d4169707baf582c305b9e015fa5602875f5d Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 14:13:41 +0200 Subject: [PATCH 126/131] rewording in docstrings --- datalad_next/tree.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 42b89a81..951f4c6b 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -842,22 +842,24 @@ def generate_nodes(self): class DatasetTree(Tree): """ - ``DatasetTree`` is a ``Tree`` whose depth is determined by the - subdataset hierarchy level, instead of directory depth. + ``DatasetTree`` is a ``Tree`` whose depth is determined primarily + by the subdataset hierarchy level (parameter ``max_dataset_depth``). + + Here, ``max_depth`` can also be specified, but it refers to the + depth of each dataset's content. If this depth is 0, only datasets + are reported, without any files or subdirectories underneath. 
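A usage sketch of the two depth parameters combined (hypothetical root path)::

    from pathlib import Path

    # datasets nested up to one level, plus one level of plain
    # directory contents within each reported dataset
    tree = DatasetTree(Path("/tmp/mydir"),
                       max_dataset_depth=1, max_depth=1)
    for node in tree.generate_nodes():
        print(node)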
Because of the different semantics of the ``max_depth`` parameter, - we implement a separate subclass of ``Tree``. + this class is implemented as a separate subclass of ``Tree``. """ def __init__(self, *args, max_dataset_depth=0, **kwargs): super().__init__(*args, **kwargs) - # by default, do not recurse into datasets' subdirectories (other - # than paths to nested subdatasets) + self.max_dataset_depth = max_dataset_depth if self.max_depth is None: + # by default, do not include datasets' contents self.max_depth = 0 - self.max_dataset_depth = max_dataset_depth - # secondary 'helper' generator that will traverse the whole tree # (once) and yield only datasets and their parents directories self._ds_generator = self._generate_datasets() @@ -958,8 +960,8 @@ def _generate_datasets(self): Generator[DirectoryNode or DatasetNode] """ - def exclude(n: _TreeNode): - # we won't find any datasets underneath the git folder + def is_excluded(n: _TreeNode): + # assumption: we won't find datasets underneath the git folder return isinstance(n, FileNode) or \ (isinstance(n, DirectoryNode) and n.path.name == ".git") From 79732a7febb3a7c34df9f9cf96ca80ccf8933a1d Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 14:27:30 +0200 Subject: [PATCH 127/131] compute whole dataset tree upfront instead of yielding in tandem with main tree --- datalad_next/tree.py | 192 ++++++++++++++++++------------------------- 1 file changed, 79 insertions(+), 113 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 951f4c6b..da1769ea 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -860,100 +860,50 @@ def __init__(self, *args, max_dataset_depth=0, **kwargs): # by default, do not include datasets' contents self.max_depth = 0 - # secondary 'helper' generator that will traverse the whole tree - # (once) and yield only datasets and their parents directories - self._ds_generator = self._generate_datasets() - # keep track of node paths that have been yielded - self._visited = set([]) - - # current value of the ds_generator. the generator will be initialized - # lazily, so for now we set the value to a dummy `_TreeNode` - # with an impossible depth just to distinguish it from None - # (None means the generator has finished). - self._next_ds = _TreeNode(self.root, None) + # lazy initialization of list of datasets and their parents, + # will be computed when generating nodes for the first time + self.ds_nodes = [] + + def __repr__(self): + return self.__class__.__name__ + \ + f"('{self.root}', " \ + f"max_dataset_depth={self.max_dataset_depth}, " \ + f"max_depth={self.max_depth})" @increment_node_count def generate_nodes(self): - """ - Yield ``_TreeNode`` objects that belong to the tree. - - A ``DatasetTree`` is just an unlimited-depth ``Tree`` with more - complex rules for pruning (skipping traversal of particular nodes). - Each exclusion rule is encoded in a function. The rules are then - combined in a final ``exclusion_func`` which is supplied to the - ``Tree`` constructor. 
- - Returns - ------- - Generator[_TreeNode] - """ - - def exclude_func(node: _TreeNode): - """Exclusion function -- here is the crux of the logic for - pruning the main tree.""" - - try: - # initialize dataset(-parent) generator if not done yet - if self._next_ds is not None and \ - self._next_ds.depth is None: # dummy depth - self._advance_ds_generator() - - if isinstance(node, DatasetNode): - # check if maximum dataset depth is exceeded - is_valid_ds = not self.exclude_node_func(node) and \ - node.ds_depth <= self.max_dataset_depth - if is_valid_ds: - self._advance_ds_generator() # go to next dataset(-parent) - return not is_valid_ds - - # exclude file or directory underneath a dataset, - # if it has depth (relative to dataset root) > max_depth, - # unless (in case of a directory) it is itself the parent of a - # valid dataset. if it's a parent of a dataset, we don't apply - # any filters -- it's just a means to get to the next dataset. - if not self._is_parent_of_ds(node): - return self.exclude_node_func(node) or \ - self._ds_child_node_exceeds_max_depth(node) - - except Exception as ex: - CapturedException(ex, level=10) # DEBUG level - lgr.debug(f"Excluding node from tree because " - "an exception occurred while applying the " - f"exclusion filter: '{node.path}'") - return True # exclude by default - - return False # do not exclude + # compute full list of dataset nodes and their parents upfront. + # this requires an unlimited-depth tree traversal, so will + # be the slowest operation + if not self.ds_nodes: + lgr.debug("Started computing dataset nodes for " + repr(self)) + self.ds_nodes = list(self.generate_dataset_nodes()) + lgr.debug("Finished computing dataset nodes for " + repr(self)) + + if not self.ds_nodes: + depth = 0 # no datasets to report on, just yield the root + else: + depth = max(node.depth for node in self.ds_nodes) + \ + self.max_depth # max levels below the deepest dataset tree = Tree( self.root, - max_depth=None, # unlimited traversal (datasets could be anywhere) - exclude_node_func=exclude_func, + max_depth=depth, + exclude_node_func=self.exclude_func, ) # synchronize exhausted levels with the main tree self.exhausted_levels = tree.exhausted_levels yield from tree.generate_nodes() - def _advance_ds_generator(self): - """Go to the next dataset or parent of dataset""" - self._next_ds = next(self._ds_generator, None) - if self._next_ds is not None: - lgr.debug( - f"Next dataset" + - (" parent" if isinstance(self._next_ds, DirectoryNode) else "") - + f": {self._next_ds.path}") - - def _generate_datasets(self): - """Generator of dataset nodes and their parent directories starting + def generate_dataset_nodes(self): + """ + Generator of dataset nodes and their parent directories starting from below the tree root and up to ``max_dataset_depth`` levels. - This secondary 'helper' tree will be generated in parallel with the - main tree but will be one step ahead, such that it always points to - the next dataset (or dataset parent) relative to the current node in - the main tree. - - We can use it to look into downstream/future nodes and decide - efficiently whether to prune the current node in the main tree. + The assumption is that (super)datasets could be located at any level + of the directory tree. Therefore, this function does a full-depth + tree traversal to discover datasets. 
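Once the dataset nodes are known, the effective traversal depth used by ``generate_nodes()`` above follows from the deepest hit; a worked sketch::

    ds_node_depths = [1, 3]   # directory depths of discovered nodes
    max_depth = 1             # content levels to show per dataset
    # deepest dataset sits at depth 3, plus one level of contents
    effective_depth = max(ds_node_depths) + max_depth
    assert effective_depth == 4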
Returns ------- @@ -965,12 +915,15 @@ def is_excluded(n: _TreeNode): return isinstance(n, FileNode) or \ (isinstance(n, DirectoryNode) and n.path.name == ".git") + # keep track of traversed nodes + # (needed to prevent yielding duplicates) + visited = set([]) + ds_tree = Tree( self.root, - max_depth=None, - exclude_node_func=exclude, + max_depth=None, # unlimited depth, datasets could be anywhere + exclude_node_func=is_excluded, ) - nodes_below_root = ds_tree.generate_nodes() next(nodes_below_root) # skip root node @@ -986,43 +939,56 @@ def is_excluded(n: _TreeNode): for par_depth, par_path in enumerate(parents_below_root): parent = Node(par_path, par_depth) - if parent not in self._visited: - self._visited.add(parent) + if parent not in visited: + visited.add(parent) yield parent - self._visited.add(node) + visited.add(node) yield node - def _ds_child_node_exceeds_max_depth(self, ds_node): - ds_parent_path = get_dataset_root_datalad_only(ds_node.path) - if ds_parent_path is None: - # it's not a dataset's child, so exclude - return True - - if ds_parent_path == self.root: - ds_parent_depth = 0 - else: - ds_parent = next((node for node in self._visited - if node.path == ds_parent_path), None) - if ds_parent is None: - # parent is not part of the tree, so exclude child - return True - ds_parent_depth = ds_parent.depth + def exclude_func(self, node): + """Exclusion function for pruning the main tree""" + include, exclude = False, True # prevent headaches - # check directory depth relative to the dataset parent - rel_depth = ds_node.depth - ds_parent_depth - return rel_depth > self.max_depth - - def _is_parent_of_ds(self, node): - if self._next_ds is None: - return False # no more datasets, can't be a parent + try: + if node in self.ds_nodes: + # we hit a dataset or the parent of a dataset + return include + + # if `max_depth` is specified for returning dataset contents, + # exclude non-dataset nodes below a dataset that have + # depth (relative to parent dataset) > max_depth + if self.max_depth > 0 and \ + not isinstance(node, DatasetNode): + + # check that node is the child of a dataset + ds_parent = self._find_closest_ds_parent(node) + if ds_parent is not None: + rel_depth = node.depth - ds_parent.depth + exceeds_max_depth = rel_depth > self.max_depth + # also filter by the user-supplied + # exclusion logic in `exclude_node_func` + return exceeds_max_depth or \ + self.exclude_node_func(node) - if self._next_ds.path == node.path: - # we hit a dataset or the parent of a dataset - self._advance_ds_generator() - return True + except Exception as ex: + CapturedException(ex, level=10) # DEBUG level + lgr.debug(f"Excluding node from tree because " + "an exception occurred while applying the " + f"exclusion filter: '{node.path}'") + + return exclude # exclude by default + + def _find_closest_ds_parent(self, node): + ds_parent = None + for parent_path in node.path.parents: # bottom-up order + ds_parent = next((n for n in self.ds_nodes + if n.path == parent_path and + isinstance(n, DatasetNode)), None) + if ds_parent is not None: + break - return False + return ds_parent class _TreeNode: From 1a02df5202101c7b26c0778a91baa42711143598 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Sat, 20 Aug 2022 18:19:40 +0200 Subject: [PATCH 128/131] use %-string formatting for log messages (evaluated only if log is emitted) (suggested by @mih) --- datalad_next/tree.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 
da1769ea..29138aba 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -556,7 +556,7 @@ def is_dataset(path: Path, installed_only=False): # ignore symlinks even if pointing to datasets, otherwise we may # get duplicate counts of datasets lgr.debug("Path is a symlink, will not check if it points to a " - f"dataset: '{path}'") + "dataset: %s", path) return False if (path / ".datalad" / "config").is_file(): @@ -770,15 +770,14 @@ def _generate_tree_nodes(self, dir_path: Path): # if symlink points to directory that we may visit or may # have visited already, do not recurse into it lgr.debug("Symlink is potentially recursive, " - "will not traverse target directory: " - f"{dir_path} -> {current_node.get_symlink_target()}") + "will not traverse target directory: %s", dir_path) return if current_node.exception is not None: # if some exception occurred when instantiating the node # (missing permissions etc), do not recurse into directory lgr.debug("Node has exception, will not traverse directory: " - f"path={current_node.path}, exc={current_node.exception}") + "%r", current_node) return # sort child nodes alphabetically @@ -876,9 +875,9 @@ def generate_nodes(self): # this requires an unlimited-depth tree traversal, so will # be the slowest operation if not self.ds_nodes: - lgr.debug("Started computing dataset nodes for " + repr(self)) + lgr.debug("Started computing dataset nodes for %r", self) self.ds_nodes = list(self.generate_dataset_nodes()) - lgr.debug("Finished computing dataset nodes for " + repr(self)) + lgr.debug("Finished computing dataset nodes for %r", self) if not self.ds_nodes: depth = 0 # no datasets to report on, just yield the root @@ -973,9 +972,9 @@ def exclude_func(self, node): except Exception as ex: CapturedException(ex, level=10) # DEBUG level - lgr.debug(f"Excluding node from tree because " - "an exception occurred while applying the " - f"exclusion filter: '{node.path}'") + lgr.debug("Excluding node from tree because " + "an exception occurred while applying " + "exclusion filter: %r", node) return exclude # exclude by default From 90ff0e96bd32983e6b2160f5e29545d9b864a05b Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Mon, 22 Aug 2022 23:11:45 +0200 Subject: [PATCH 129/131] rename option --dataset-depth to --recursion-limit and add short form --- datalad_next/tests/test_tree.py | 6 ++--- datalad_next/tree.py | 39 +++++++++++++++++---------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 12fc5407..786a26ab 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -692,7 +692,7 @@ def test_print_tree( 'tree', root, '--depth', str(depth), - '--dataset-depth', str(dataset_depth) + '--recursion-limit', str(dataset_depth) ] _, actual_res, _ = get_tree_rendered_output(command) expected_res = expected_str.lstrip("\n") # strip first newline @@ -709,7 +709,7 @@ def test_print_tree_without_datasets(self): 'tree', root, '--depth', '10', - '--dataset-depth', '10', + '--recursion-limit', '10', '--include-files' ] _, actual_res, _ = get_tree_rendered_output(command) @@ -728,7 +728,7 @@ def test_print_stats( 'tree', root, '--depth', str(depth), - '--dataset-depth', str(dataset_depth) + '--recursion-limit', str(dataset_depth) ] _, _, actual_res = get_tree_rendered_output(command) expected_res = expected_stats_str diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 29138aba..2a38221d 100644 --- a/datalad_next/tree.py +++ 
b/datalad_next/tree.py @@ -105,7 +105,7 @@ class TreeCommand(Interface): *Dataset discovery* - Using the [CMD: ``--dataset-depth`` CMD][PY: ``dataset_depth`` PY] + Using the [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY] option, this command generates the layout of dataset hierarchies based on subdataset nesting level, regardless of their location in the filesystem. @@ -126,8 +126,8 @@ class TreeCommand(Interface): datasets. **Performance note**: since no assumption is made on the location of - datasets, running this command with the [CMD: ``--dataset-depth`` CMD][PY: - ``dataset_depth`` PY] option does a full scan of the whole directory + datasets, running this command with the [CMD: ``--recursion-limit`` CMD][PY: + ``recursion_limit`` PY] option does a full scan of the whole directory tree. As such, it can be significantly slower than a call with an equivalent output that uses [CMD: ``--depth`` CMD][PY: ``depth`` PY] to limit the tree instead. @@ -186,21 +186,22 @@ class TreeCommand(Interface): Defaults to the current directory.""", constraints=EnsureStr() | EnsureNone()), depth=Parameter( - args=("--depth",), + args=("-L", "--depth",), doc="""maximum level of subdirectories to include in the tree. If not specified, will generate the full tree with no depth constraint. If paired with - [CMD: ``--dataset-depth`` CMD][PY: ``dataset_depth`` PY], + [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY], refers to the maximum directory level to generate underneath each dataset.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), - dataset_depth=Parameter( - args=("--dataset-depth",), + recursion_limit=Parameter( + args=("-R", "--recursion-limit",), + metavar="LEVELS", doc="""maximum level of nested subdatasets to include in the tree. 0 means only top-level datasets, 1 means top-level - datasets and their immediate subdatasets, etc. *Note*: may be - slow on large directory trees.""", + datasets and their immediate subdatasets, etc. *Note*: may have + slow performance on large directory trees.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), include_files=Parameter( args=("--include-files",), @@ -210,8 +211,8 @@ class TreeCommand(Interface): args=("--include-hidden",), doc="""include hidden files/directories in the tree. This option does not affect which directories will be searched for - datasets when specifying [CMD: ``--dataset-depth`` CMD][PY: - ``dataset_depth`` PY]. For example, datasets located underneath + datasets when specifying [CMD: ``--recursion-limit`` CMD][PY: + ``recursion_limit`` PY]. 
For example, datasets located underneath the hidden folder `.datalad` will be reported even if [CMD: ``--include-hidden`` CMD][PY: ``include_hidden`` PY] is omitted.""", action='store_true'), @@ -221,15 +222,15 @@ class TreeCommand(Interface): dict(text="Show up to 3 levels of subdirectories below the current " "directory, including files and hidden contents", code_py="tree(depth=3, include_files=True, include_hidden=True)", - code_cmd="datalad tree --depth 3 --include-files --include-hidden"), + code_cmd="datalad tree -L 3 --include-files --include-hidden"), dict(text="Find all top-level datasets located anywhere under ``/tmp``", - code_py="tree('/tmp', dataset_depth=0)", - code_cmd="datalad tree /tmp --dataset-depth 0"), + code_py="tree('/tmp', recursion_limit=0)", + code_cmd="datalad tree /tmp -R 0"), dict(text="Report first- and second-level subdatasets and their " "directory contents, up to 1 subdirectory deep within each " "dataset", - code_py="tree(dataset_depth=2, depth=1)", - code_cmd="datalad tree --dataset-depth 2 --depth 1"), + code_py="tree(recursion_limit=2, depth=1)", + code_cmd="datalad tree -R 2 -L 1"), ] @staticmethod @@ -239,14 +240,14 @@ def __call__( path='.', *, depth=None, - dataset_depth=None, + recursion_limit=None, include_files=False, include_hidden=False): - if dataset_depth is not None: + if recursion_limit is not None: # special tree defined by subdataset nesting depth tree_cls = DatasetTree - dataset_tree_args = {"max_dataset_depth": dataset_depth} + dataset_tree_args = {"max_dataset_depth": recursion_limit} else: # simple tree defined by directory depth tree_cls = Tree From a864693970ca1397487d3b9c1c5fd42c818f9028 Mon Sep 17 00:00:00 2001 From: Caterina Trainito Date: Mon, 22 Aug 2022 23:57:25 +0200 Subject: [PATCH 130/131] add option --recursive for unlimited-depth dataset tree --- datalad_next/tests/test_tree.py | 47 +++++++++++++++++++++++++++++--- datalad_next/tree.py | 48 ++++++++++++++++++++------------- 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/datalad_next/tests/test_tree.py b/datalad_next/tests/test_tree.py index 786a26ab..63a1ee42 100644 --- a/datalad_next/tests/test_tree.py +++ b/datalad_next/tests/test_tree.py @@ -665,6 +665,37 @@ class TestDatasetTree(TestTree): "expected_str": """ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ +└── [DS~0] superds1/ + ├── sd1_dir0/ + │ ├── sd1_d0_repo0/ + │ └── [DS~1] sd1_d0_subds0/ + ├── [DS~0] sd1_ds0/ + └── [DS~1] (not installed) sd1_subds0/ +""" + }, + { + "dataset_depth": None, + "depth": 0, + "expected_stats_str": "7 datasets, 1 directory", + "expected_str": """ +├── [DS~0] superds0/ +│ └── [DS~1] sd0_subds0/ +│ └── [DS~2] sd0_sub0_subds0/ +└── [DS~0] superds1/ + ├── sd1_dir0/ + │ └── [DS~1] sd1_d0_subds0/ + ├── [DS~0] sd1_ds0/ + └── [DS~1] (not installed) sd1_subds0/ +""" + }, + { + "dataset_depth": None, + "depth": 2, + "expected_stats_str": "7 datasets, 2 directories", + "expected_str": """ +├── [DS~0] superds0/ +│ └── [DS~1] sd0_subds0/ +│ └── [DS~2] sd0_sub0_subds0/ └── [DS~0] superds1/ ├── sd1_dir0/ │ ├── sd1_d0_repo0/ @@ -688,11 +719,16 @@ def test_print_tree( self, dataset_depth, depth, expected_str ): root = str(self.path / "root") + + recursive_opts = ["--recursive"] + if dataset_depth is not None: + recursive_opts = ['--recursion-limit', str(dataset_depth)] + command = [ 'tree', root, '--depth', str(depth), - '--recursion-limit', str(dataset_depth) + *recursive_opts ] _, actual_res, _ = get_tree_rendered_output(command) expected_res = expected_str.lstrip("\n") # strip first 
newline @@ -709,7 +745,7 @@ def test_print_tree_without_datasets(self): 'tree', root, '--depth', '10', - '--recursion-limit', '10', + '--recursive', '--include-files' ] _, actual_res, _ = get_tree_rendered_output(command) @@ -724,11 +760,16 @@ def test_print_stats( self, dataset_depth, depth, expected_stats_str ): root = str(self.path / "root") + + recursive_opts = ["--recursive"] + if dataset_depth is not None: + recursive_opts = ['--recursion-limit', str(dataset_depth)] + command = [ 'tree', root, '--depth', str(depth), - '--recursion-limit', str(dataset_depth) + *recursive_opts ] _, _, actual_res = get_tree_rendered_output(command) expected_res = expected_stats_str diff --git a/datalad_next/tree.py b/datalad_next/tree.py index 2a38221d..721cee4c 100644 --- a/datalad_next/tree.py +++ b/datalad_next/tree.py @@ -105,13 +105,14 @@ class TreeCommand(Interface): *Dataset discovery* - Using the [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY] + Using the [CMD: ``--recursive`` CMD][PY: ``recursive`` PY] or [CMD: + ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY] option, this command generates the layout of dataset hierarchies based on subdataset nesting level, regardless of their location in the filesystem. In this case, tree depth is determined by subdataset depth. This mode - is therefore suited for discovering available datasets when their + is thus suited for discovering available datasets when their location is not known in advance. By default, only datasets are listed, without their contents. If @@ -126,7 +127,8 @@ class TreeCommand(Interface): datasets. **Performance note**: since no assumption is made on the location of - datasets, running this command with the [CMD: ``--recursion-limit`` CMD][PY: + datasets, running this command with the [CMD: ``--recursive`` CMD][PY: + ``recursive`` PY] or [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY] option does a full scan of the whole directory tree. As such, it can be significantly slower than a call with an equivalent output that uses [CMD: ``--depth`` CMD][PY: ``depth`` PY] to @@ -187,21 +189,27 @@ class TreeCommand(Interface): constraints=EnsureStr() | EnsureNone()), depth=Parameter( args=("-L", "--depth",), - doc="""maximum level of subdirectories to include in the tree. + doc="""limit the tree to maximum level of subdirectories. If not specified, will generate the full tree with no depth constraint. - If paired with + If paired with [CMD: ``--recursive`` CMD][PY: ``recursive`` PY] or [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY], - refers to the maximum directory level to generate underneath + refers to the maximum directory level to output below each dataset.""", constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()), + recursive=Parameter( + args=("-r", "--recursive",), + doc="""produce a dataset tree of the full hierarchy of nested + subdatasets. *Note*: may have slow performance on large + directory trees.""", + action='store_true'), recursion_limit=Parameter( args=("-R", "--recursion-limit",), metavar="LEVELS", - doc="""maximum level of nested subdatasets to include in the - tree. 0 means only top-level datasets, 1 means top-level - datasets and their immediate subdatasets, etc. *Note*: may have - slow performance on large directory trees.""", + doc="""limit the dataset tree to maximum level of nested + subdatasets. 0 means include only top-level datasets, 1 means + top-level datasets and their immediate subdatasets, etc. 
*Note*:
+            may have slow performance on large directory trees.""",
             constraints=EnsureInt() & EnsureRange(min=0) | EnsureNone()),
         include_files=Parameter(
             args=("--include-files",),
@@ -211,7 +219,8 @@ class TreeCommand(Interface):
             args=("--include-hidden",),
             doc="""include hidden files/directories in the tree. This
             option does not affect which directories will be searched for
-            datasets when specifying [CMD: ``--recursion-limit`` CMD][PY:
+            datasets when specifying [CMD: ``--recursive`` CMD][PY:
+            ``recursive`` PY] or [CMD: ``--recursion-limit`` CMD][PY:
             ``recursion_limit`` PY]. For example, datasets located underneath
             the hidden folder `.datalad` will be reported even if [CMD:
             ``--include-hidden`` CMD][PY: ``include_hidden`` PY] is omitted.""",
             action='store_true'),
@@ -226,11 +235,11 @@ class TreeCommand(Interface):
         dict(text="Find all top-level datasets located anywhere under ``/tmp``",
              code_py="tree('/tmp', recursion_limit=0)",
              code_cmd="datalad tree /tmp -R 0"),
-        dict(text="Report first- and second-level subdatasets and their "
-                  "directory contents, up to 1 subdirectory deep within each "
+        dict(text="Report all subdatasets recursively and their directory "
+                  "contents, up to 1 subdirectory deep within each "
                   "dataset",
-             code_py="tree(recursion_limit=2, depth=1)",
-             code_cmd="datalad tree -R 2 -L 1"),
+             code_py="tree(recursive=True, depth=1)",
+             code_cmd="datalad tree -r -L 1"),
     ]

     @staticmethod
@@ -240,11 +249,12 @@ def __call__(
             path='.',
             *,
             depth=None,
+            recursive=False,
             recursion_limit=None,
             include_files=False,
             include_hidden=False):

-        if recursion_limit is not None:
+        if recursive or recursion_limit is not None:
             # special tree defined by subdataset nesting depth
             tree_cls = DatasetTree
             dataset_tree_args = {"max_dataset_depth": recursion_limit}
@@ -852,9 +862,10 @@ class DatasetTree(Tree):
     Because of the different semantics of the ``max_depth`` parameter, this
     class is implemented as a separate subclass of ``Tree``.
     """
-    def __init__(self, *args, max_dataset_depth=0, **kwargs):
+    def __init__(self, *args, max_dataset_depth=None, **kwargs):
         super().__init__(*args, **kwargs)

+        # default max_dataset_depth 'None' means unlimited subdataset depth
         self.max_dataset_depth = max_dataset_depth
         if self.max_depth is None:
             # by default, do not include datasets' contents
@@ -931,7 +942,8 @@ def is_excluded(n: _TreeNode):
         # for each dataset node, yield its parents first, then
         # yield the dataset itself
         if isinstance(node, DatasetNode) and \
-                node.ds_depth <= self.max_dataset_depth and \
+                (self.max_dataset_depth is None or
+                 node.ds_depth <= self.max_dataset_depth) and \
                 not self.exclude_node_func(node):

             # yield parent directories if not already done

From 86346185fc2ceba0dbbeb4bee88cac2198591ddf Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Wed, 24 Aug 2022 09:03:56 +0200
Subject: [PATCH 131/131] Add changelog snippet

---
 changelog.d/20220824_085736_michael.hanke_nf_tree.md | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 changelog.d/20220824_085736_michael.hanke_nf_tree.md

diff --git a/changelog.d/20220824_085736_michael.hanke_nf_tree.md b/changelog.d/20220824_085736_michael.hanke_nf_tree.md
new file mode 100644
index 00000000..d8f519d6
--- /dev/null
+++ b/changelog.d/20220824_085736_michael.hanke_nf_tree.md
@@ -0,0 +1,11 @@
+### 💫 Enhancements and new features
+
+- New `tree` command for traversing a directory hierarchy.
+  Like the UNIX equivalent, it can visualize a directory tree.
+  Additionally, it annotates the output with DataLad-related
+  information, such as the location of datasets and their nesting
+  depth. Besides visualization, `tree` also reports structured
+  data in the form of result records that enable other applications
+  to use `tree` for gathering data from the file system.
+  Fixes https://github.com/datalad/datalad-next/issues/78 via
+  https://github.com/datalad/datalad-next/pull/92 (by @catetrai)
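
As a closing illustration of the command surface this series converges on,
here is a minimal usage sketch from Python. It assumes `datalad-next` is
installed so that `tree` is exposed via `datalad.api`, and uses DataLad's
standard `return_type`/`result_renderer` call options; the `path` and `type`
keys shown are standard result-record keys, while any further
`tree`-specific keys would be an assumption.

# minimal sketch, assuming datalad-next is installed and `tree` is
# available through the datalad Python API
from datalad.api import tree

# find all top-level datasets located anywhere under /tmp
# (CLI equivalent: datalad tree /tmp -R 0)
for res in tree('/tmp', recursion_limit=0,
                return_type='generator',
                result_renderer='disabled'):
    # each result record is a plain dict; 'path' and 'type' are
    # standard DataLad result keys, further keys are command-specific
    print(res.get('type'), res.get('path'))

# report the full subdataset hierarchy, plus one directory level
# below each dataset root (CLI equivalent: datalad tree -r -L 1)
records = tree(recursive=True, depth=1, result_renderer='disabled')

Note that `recursive=True` is not a separate mode: as the `__call__` change
above shows, it simply selects a `DatasetTree` with `max_dataset_depth=None`,
i.e. unlimited subdataset depth.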