Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

datalad sensitive marking fixes #739

Merged
merged 4 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 20 additions & 10 deletions heudiconv/external/dlad.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,16 @@ def add_to_datalad(
# annex_add_opts=['--include-dotfiles']
)

# TODO: filter for only changed files?
# Provide metadata for sensitive information
mark_sensitive(ds, "sourcedata")
mark_sensitive(ds, "*_scans.tsv") # top level
mark_sensitive(ds, "*/*_scans.tsv") # within subj
mark_sensitive(ds, "*/*/*_scans.tsv") # within sess/subj
mark_sensitive(ds, "*/anat") # within subj
mark_sensitive(ds, "*/*/anat") # within ses/subj
last_commit = "HEAD"
mark_sensitive(ds, "sourcedata", last_commit)
mark_sensitive(ds, "*_scans.tsv", last_commit) # top level
mark_sensitive(ds, "*/*_scans.tsv", last_commit) # within subj
mark_sensitive(ds, "*/*/*_scans.tsv", last_commit) # within sess/subj
mark_sensitive(ds, "*/anat", last_commit) # within subj
mark_sensitive(ds, "*/*/anat", last_commit) # within ses/subj
if dsh_path:
mark_sensitive(ds, ".heudiconv") # entire .heudiconv!
mark_sensitive(ds, ".heudiconv", last_commit) # entire .heudiconv!
superds.save(path=ds.path, message=msg, recursive=True)

assert not ds.repo.dirty
Expand All @@ -178,26 +178,36 @@ def add_to_datalad(
"""


def mark_sensitive(ds: Dataset, path_glob: str) -> None:
def mark_sensitive(ds: Dataset, path_glob: str, commit: str = None) -> None:
"""

Parameters
----------
ds : Dataset to operate on
path_glob : str
glob of the paths within dataset to work on
commit : str
commit which files to mark

Returns
-------
None
"""
paths = glob(op.join(ds.path, path_glob))
if commit:
paths_in_commit = [
op.join(ds.path, nf)
for nf in ds.repo.call_git(
["show", "--name-only", commit, "--format=oneline"]
).split("\n")[1:]
]
paths = [p for p in paths if p in paths_in_commit]
if not paths:
return
lgr.debug("Marking %d files with distribution-restrictions field", len(paths))
# set_metadata can be a bloody generator
res = ds.repo.set_metadata(
paths, init=dict([("distribution-restrictions", "sensitive")]), recursive=True
paths, add=dict([("distribution-restrictions", "sensitive")]), recursive=True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I was worried that with --overwrite we might keep adding multiple distribution-restrictions=sensitive values... but it seems that the same value does not get duplicated; only a new, different value would be added:

❯ git annex metadata --set distribution-restrictions+=sensitive sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb
metadata sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb 
  distribution-restrictions=sensitive
  distribution-restrictions-lastchanged=2024-02-24@00-12-47
  lastchanged=2024-02-24@00-12-47
ok
(recording state in git...)
❯ git annex metadata --set distribution-restrictions+=sensitive sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb
metadata sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb 
  distribution-restrictions=sensitive
  distribution-restrictions-lastchanged=2024-02-24@00-12-49
  lastchanged=2024-02-24@00-12-49
ok
(recording state in git...)
❯ git annex metadata  sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb
metadata sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb 
  distribution-restrictions=sensitive
  distribution-restrictions-lastchanged=2024-02-24@00-12-49
  lastchanged=2024-02-24@00-12-49

Only the timestamp would be changed... Since we are filtering on only the saved files, I think that should be good.

)
if inspect.isgenerator(res):
res = list(res)
20 changes: 20 additions & 0 deletions heudiconv/external/tests/test_dlad.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,23 @@ def test_mark_sensitive(tmp_path: Path) -> None:
# g2 since the same content
assert not all_meta.pop("g1", None) # nothing or empty record
assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec}


def test_mark_sensitive_last_commit(tmp_path: Path) -> None:
    """Check ``mark_sensitive`` when it is given a commit to restrict to.

    Four files are saved into a freshly created dataset, then only the
    paths matching ``f*`` are marked, with ``HEAD`` passed as the commit.
    """
    dataset = dl.Dataset(tmp_path).create(force=True)
    tree = {
        "f1": "d1",
        "f2": "d2",
        "g1": "d3",
        "g2": "d1",
    }
    create_tree(str(tmp_path), tree)
    dataset.save(".")

    mark_sensitive(dataset, "f*", "HEAD")

    sensitive = {"distribution-restrictions": ["sensitive"]}
    metadata = dict(dataset.repo.get_metadata("."))
    # g1 must carry either no record at all or an empty one
    assert not metadata.pop("g1", None)
    # g2 shows up next to f1/f2 since it has the same content as f1
    assert metadata == {"f1": sensitive, "f2": sensitive, "g2": sensitive}
Loading