From 6ba6cbad1891c635fdf504f6adc02ff00b52f501 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Tue, 20 Feb 2024 13:50:27 -0500 Subject: [PATCH 1/4] mark sensitive only files added in the commit. metadata add rather than init to fix reruns issue --- heudiconv/external/dlad.py | 30 ++++++++++++++++++--------- heudiconv/external/tests/test_dlad.py | 20 ++++++++++++++++++ 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/heudiconv/external/dlad.py b/heudiconv/external/dlad.py index 2d65c2b2..5e1b305a 100644 --- a/heudiconv/external/dlad.py +++ b/heudiconv/external/dlad.py @@ -153,16 +153,16 @@ def add_to_datalad( # annex_add_opts=['--include-dotfiles'] ) - # TODO: filter for only changed files? # Provide metadata for sensitive information - mark_sensitive(ds, "sourcedata") - mark_sensitive(ds, "*_scans.tsv") # top level - mark_sensitive(ds, "*/*_scans.tsv") # within subj - mark_sensitive(ds, "*/*/*_scans.tsv") # within sess/subj - mark_sensitive(ds, "*/anat") # within subj - mark_sensitive(ds, "*/*/anat") # within ses/subj + last_commit = "HEAD" + mark_sensitive(ds, "sourcedata", last_commit) + mark_sensitive(ds, "*_scans.tsv", last_commit) # top level + mark_sensitive(ds, "*/*_scans.tsv", last_commit) # within subj + mark_sensitive(ds, "*/*/*_scans.tsv", last_commit) # within sess/subj + mark_sensitive(ds, "*/anat", last_commit) # within subj + mark_sensitive(ds, "*/*/anat", last_commit) # within ses/subj if dsh_path: - mark_sensitive(ds, ".heudiconv") # entire .heudiconv! + mark_sensitive(ds, ".heudiconv", last_commit) # entire .heudiconv! superds.save(path=ds.path, message=msg, recursive=True) assert not ds.repo.dirty @@ -178,7 +178,7 @@ def add_to_datalad( """ -def mark_sensitive(ds: Dataset, path_glob: str) -> None: +def mark_sensitive(ds: Dataset, path_glob: str, commit: str = None) -> None: """ Parameters @@ -186,18 +186,28 @@ def mark_sensitive(ds: Dataset, path_glob: str) -> None: ds : Dataset to operate on path_glob : str glob of the paths within dataset to work on + commit : str + commit which files to mark Returns ------- None """ paths = glob(op.join(ds.path, path_glob)) + if commit: + paths_in_commit = [ + op.join(ds.path, nf) + for nf in ds.repo.call_git( + ["show", "--name-only", commit, "--format=oneline"] + ).split("\n")[1:] + ] + paths = [p for p in paths if p in paths_in_commit] if not paths: return lgr.debug("Marking %d files with distribution-restrictions field", len(paths)) # set_metadata can be a bloody generator res = ds.repo.set_metadata( - paths, init=dict([("distribution-restrictions", "sensitive")]), recursive=True + paths, add=dict([("distribution-restrictions", "sensitive")]), recursive=True ) if inspect.isgenerator(res): res = list(res) diff --git a/heudiconv/external/tests/test_dlad.py b/heudiconv/external/tests/test_dlad.py index 6518d648..5900311a 100644 --- a/heudiconv/external/tests/test_dlad.py +++ b/heudiconv/external/tests/test_dlad.py @@ -28,3 +28,23 @@ def test_mark_sensitive(tmp_path: Path) -> None: # g2 since the same content assert not all_meta.pop("g1", None) # nothing or empty record assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec} + + +def test_mark_sensitive_last_commit(tmp_path: Path) -> None: + ds = dl.Dataset(tmp_path).create(force=True) + create_tree( + str(tmp_path), + { + "f1": "d1", + "f2": "d2", + "g1": "d3", + "g2": "d1", + }, + ) + ds.save(".") + mark_sensitive(ds, "f*", "HEAD") + all_meta = dict(ds.repo.get_metadata(".")) + target_rec = {"distribution-restrictions": ["sensitive"]} + # g2 since the same content + assert not all_meta.pop("g1", None) # nothing or empty record + assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec} From 2844d5413d5089126de66fbf2e05b9427c3715b2 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Wed, 21 Feb 2024 10:19:37 -0500 Subject: [PATCH 2/4] filter mark_sensitive based on save output --- heudiconv/external/dlad.py | 38 +++++++++++++-------------- heudiconv/external/tests/test_dlad.py | 7 ++--- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/heudiconv/external/dlad.py b/heudiconv/external/dlad.py index 5e1b305a..5ea57aa6 100644 --- a/heudiconv/external/dlad.py +++ b/heudiconv/external/dlad.py @@ -146,23 +146,27 @@ def add_to_datalad( message="Added gitattributes to place all .heudiconv content" " under annex", ) - ds.save( + save_res = ds.save( ".", recursive=True # not in effect! ? # annex_add_opts=['--include-dotfiles'] ) + annexed_files = [sr["path"] for sr in save_res if sr["key"]] # Provide metadata for sensitive information - last_commit = "HEAD" - mark_sensitive(ds, "sourcedata", last_commit) - mark_sensitive(ds, "*_scans.tsv", last_commit) # top level - mark_sensitive(ds, "*/*_scans.tsv", last_commit) # within subj - mark_sensitive(ds, "*/*/*_scans.tsv", last_commit) # within sess/subj - mark_sensitive(ds, "*/anat", last_commit) # within subj - mark_sensitive(ds, "*/*/anat", last_commit) # within ses/subj + sensitive_patterns = [ + "sourcedata", + "*_scans.tsv", # top level + "*/*_scans.tsv", # within subj + "*/*/*_scans.tsv", # within sess/subj + "*/anat", # within subj + "*/*/anat", # within ses/subj + ] + for sp in sensitive_patterns: + mark_sensitive(ds, sp, annexed_files) if dsh_path: - mark_sensitive(ds, ".heudiconv", last_commit) # entire .heudiconv! + mark_sensitive(ds, ".heudiconv") # entire .heudiconv! superds.save(path=ds.path, message=msg, recursive=True) assert not ds.repo.dirty @@ -178,7 +182,7 @@ def add_to_datalad( """ -def mark_sensitive(ds: Dataset, path_glob: str, commit: str = None) -> None: +def mark_sensitive(ds: Dataset, path_glob: str, files: list[str] = None) -> None: """ Parameters @@ -186,22 +190,16 @@ def mark_sensitive(ds: Dataset, path_glob: str, commit: str = None) -> None: ds : Dataset to operate on path_glob : str glob of the paths within dataset to work on - commit : str - commit which files to mark + files : list[str] + subset of files to mark Returns ------- None """ paths = glob(op.join(ds.path, path_glob)) - if commit: - paths_in_commit = [ - op.join(ds.path, nf) - for nf in ds.repo.call_git( - ["show", "--name-only", commit, "--format=oneline"] - ).split("\n")[1:] - ] - paths = [p for p in paths if p in paths_in_commit] + if files: + paths = [p for p in paths if p in files] if not paths: return lgr.debug("Marking %d files with distribution-restrictions field", len(paths)) diff --git a/heudiconv/external/tests/test_dlad.py b/heudiconv/external/tests/test_dlad.py index 5900311a..325744a7 100644 --- a/heudiconv/external/tests/test_dlad.py +++ b/heudiconv/external/tests/test_dlad.py @@ -30,7 +30,7 @@ def test_mark_sensitive(tmp_path: Path) -> None: assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec} -def test_mark_sensitive_last_commit(tmp_path: Path) -> None: +def test_mark_sensitive_subset(tmp_path: Path) -> None: ds = dl.Dataset(tmp_path).create(force=True) create_tree( str(tmp_path), @@ -42,9 +42,10 @@ def test_mark_sensitive_last_commit(tmp_path: Path) -> None: }, ) ds.save(".") - mark_sensitive(ds, "f*", "HEAD") + mark_sensitive(ds, "f*", [str(tmp_path / "f1")]) all_meta = dict(ds.repo.get_metadata(".")) target_rec = {"distribution-restrictions": ["sensitive"]} # g2 since the same content assert not all_meta.pop("g1", None) # nothing or empty record - assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec} + assert not all_meta.pop("f2", None) # nothing or empty record + assert all_meta == {"f1": target_rec, "g2": target_rec} From 328aae8a60fc3f11d0e78685b733f19d38dcee3c Mon Sep 17 00:00:00 2001 From: bpinsard Date: Wed, 21 Feb 2024 10:23:35 -0500 Subject: [PATCH 3/4] fix typing --- heudiconv/external/dlad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heudiconv/external/dlad.py b/heudiconv/external/dlad.py index 5ea57aa6..660f02f0 100644 --- a/heudiconv/external/dlad.py +++ b/heudiconv/external/dlad.py @@ -182,7 +182,7 @@ def add_to_datalad( """ -def mark_sensitive(ds: Dataset, path_glob: str, files: list[str] = None) -> None: +def mark_sensitive(ds: Dataset, path_glob: str, files: list[str] | None = None) -> None: """ Parameters From 78db99eb8b5c2cb5436475a57326e990aebcab38 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Wed, 21 Feb 2024 10:32:43 -0500 Subject: [PATCH 4/4] fix: get key properly for git-files --- heudiconv/external/dlad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heudiconv/external/dlad.py b/heudiconv/external/dlad.py index 660f02f0..dd9daeae 100644 --- a/heudiconv/external/dlad.py +++ b/heudiconv/external/dlad.py @@ -152,7 +152,7 @@ def add_to_datalad( # not in effect! ? # annex_add_opts=['--include-dotfiles'] ) - annexed_files = [sr["path"] for sr in save_res if sr["key"]] + annexed_files = [sr["path"] for sr in save_res if sr.get("key", None)] # Provide metadata for sensitive information sensitive_patterns = [