From a4f51868cb06613f27c8bc341e594af4e8f0fa7b Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Fri, 10 Mar 2023 11:38:38 +0100
Subject: [PATCH 1/6] Start with a full file copy for creating a non-exporting
 base class

---
 datalad_dataverse/baseremote.py | 726 ++++++++++++++++++++++++++++++++
 datalad_dataverse/remote.py     |   4 +-
 2 files changed, 729 insertions(+), 1 deletion(-)
 create mode 100644 datalad_dataverse/baseremote.py

diff --git a/datalad_dataverse/baseremote.py b/datalad_dataverse/baseremote.py
new file mode 100644
index 0000000..462c112
--- /dev/null
+++ b/datalad_dataverse/baseremote.py
@@ -0,0 +1,726 @@
+from __future__ import annotations
+
+from pathlib import Path
+import re
+import sys
+
+from annexremote import ExportRemote
+from collections import namedtuple
+from pyDataverse.api import DataAccessApi
+from pyDataverse.models import Datafile
+from requests import delete
+from requests.auth import HTTPBasicAuth
+from shutil import which
+
+from datalad_next.annexremotes import (
+    RemoteError,
+    SpecialRemote,
+    UnsupportedRequest,
+    super_main,
+)
+from datalad_next.credman import CredentialManager
+# this import is a vast overstatement, we only need
+# `AnnexRepo.config`, nothing else
+from datalad_next.datasets import LegacyAnnexRepo as AnnexRepo
+
+from datalad_dataverse.utils import (
+    get_api,
+    format_doi,
+    mangle_directory_names,
+)
+
+# Object to hold what's on dataverse's end for a given database id.
+# We need the paths in the latest version (if the id is part of that) in order
+# to know whether we need to replace rather than just upload a file, and we need
+# to know whether an id is released, since that implies we can't replace it
+# (but we could change the metadata, right?) and we can't actually delete it.
+# The latter means: it can be removed from the new DRAFT version, but it is
+# still available via its id from an older version of the dataverse dataset.
+# This namedtuple is meant to be the value type of a dict with ids as its keys:
+FileIdRecord = namedtuple("FileIdRecord", ["path", "is_released"])
+
+# Needed to determine whether RENAMEEXPORT can be considered implemented.
+CURL_EXISTS = which('curl') is not None
+
+
+class DataverseRemote(ExportRemote, SpecialRemote):
+    """Special remote to interface dataverse datasets.
+
+    There are two modes of operation:
+    - "Regular" special remote (configured with exporttree=false; default),
+      where a flat list of annex keys is put into that dataverse dataset, and
+    - export remote (configured with exporttree=yes), where the actual worktree
+      is put into the dataverse dataset
+
+    Dataverse
+    ---------
+
+    Dataverse datasets come with their own versioning. A version is created upon
+    publishing a draft version. When a change is pushed, it alters an already
+    existing draft version, or - if none existed - the push (implicitly)
+    creates a new draft version. Publishing is not part of this special remote's
+    operations, as it has no means to "discover" that this should happen (it
+    only communicates with git-annex on a per-file basis and does not even know
+    what annex command ran).
+
+    Files put on dataverse have a database ID associated with them, while their
+    "path" in the dataverse dataset is treated as metadata of that file. The ID
+    is persistent, but not technically a content identifier, as it is not
+    derived from the content like a hash.
+    However, once files are published (by being part of a published dataset
+    version) those IDs can serve as a content identifier for practical
+    purposes, since they are not going to change anymore. There's no "real"
+    guarantee for that, but in reality changing it would require some strange
+    DB migration to be performed on the side of the respective dataverse
+    instance. Note, however, that a file can be pushed into a draft version and
+    replaced/removed before it was ever published. In that case the ID of an
+    annex key could change. Hence, to some extent the special remote needs to
+    be aware of whether an annex key and its ID were part of a released
+    version of the dataverse dataset in order to make use of those IDs.
+
+    Recording the IDs allows accessing older versions of a file even in export
+    mode, as well as faster access to keys for download. The latter is because
+    the API requires the ID, and a path-based approach would therefore require
+    looking up the ID first (adding a request). Therefore, the special remote
+    records the IDs of annex keys and tries to rely on them if possible.
+
+    There is one more trap to mention with dataverse, and that is its
+    restrictions on directory and file names.
+    See https://github.com/IQSS/dataverse/issues/8807#issuecomment-1164434278
+
+    Regular special remote
+    ----------------------
+
+    In principle the regular special remote simply maintains a flat list of
+    annex keys in the dataverse dataset, where the presented file names are the
+    annex keys. Therefore, it is feasible to simply rely on the remote path of a
+    key when checking for its presence. However, as laid out above, it is faster
+    to utilize knowledge about the database ID, so the idea is to use path
+    matching only as a fallback.
+
+    Export remote
+    -------------
+
+    In export mode the special remote cannot conclude the annex key from a
+    remote path in general. In order to be able to access versions of a file
+    that are not part of the latest version (draft or not) of the dataverse
+    dataset, reliance on recorded database IDs is crucial.
+
+    Implementation note
+    -------------------
+
+    The special remote at first only retrieves a record of what is in the latest
+    version (draft or not) of the dataverse dataset, including an annotation of
+    content on whether it is released. This annotation is crucial, since it has
+    implications on what to record should changes be pushed to it.
+    For example:
+    It is not possible to actually remove content from a released version. That
+    means, if annex asks the special remote to remove content, it can only make
+    sure that the respective key is not part of the current draft anymore. Its
+    ID, however, remains on record. If the content was not released yet, it is
+    actually gone and the ID is taken off the record.
+
+    This record is retrieved lazily when first required, but only once (avoiding
+    an additional per-key request), and then updated locally when changes are
+    pushed. (Note that we only ever push into a draft version.)
+    In case of checking the presence of a key that does not appear to be part of
+    the latest version, a request for such a record on all known dataverse
+    dataset versions is made. Again, this is lazy and only one request. This may
+    potentially be a relatively expensive request, but the latency introduced by
+    many smaller requests is likely a lot more expensive.
+    """
+
+    def __init__(self, *args):
+        super().__init__(*args)
+        self.configs['url'] = 'The Dataverse URL for the remote'
+        self.configs['doi'] = 'DOI to the dataset'
+        self.configs['dlacredential'] = \
+            'Identifier used to retrieve an API token from a local ' \
+            'credential store'
+        # dataverse dataset identifier
+        self._doi = None
+        # dataverse instance URL
+        self._url = None
+        # dataverse native API handle
+        self._api = None
+        self._data_access_api = None
+        self._token = None
+        self._old_dataset_versions = None
+        self._dataset_latest = None
+        self._files_old = None
+        self._files_latest = None
+        self.is_draft = None
+
+    #
+    # Essential API
+    #
+    def prepare(self):
+        # remove any trailing slash from URL
+        url = self.annex.getconfig('url').rstrip('/')
+        if not url:
+            raise ValueError('url must be specified')
+        doi = self.annex.getconfig('doi')
+        if not doi:
+            raise ValueError('doi must be specified')
+        # standardize formatting to minimize complexity downstream
+        self._doi = format_doi(doi)
+        self._url = url
+        # we need an access token, use the repo's configmanager to
+        # query for one
+        repo = AnnexRepo(self.annex.getgitdir())
+        # TODO the below is almost literally taken from
+        # the datalad-annex:: implementation in datalad-next
+        # this could become a common helper
+        # TODO https://github.com/datalad/datalad-dataverse/issues/171
+        credman = CredentialManager(repo.config)
+        credential_name = self.annex.getconfig('dlacredential')
+        api = get_api(
+            self._url,
+            credman,
+            credential_name=credential_name,
+        )
+        # store for reuse with data access API.
+        # we do not initialize that one here, because it is only used
+        # for file downloads
+        self._token = api.api_token
+        self._api = api
+
+    def initremote(self):
+        """
+        Use this command to initialize a remote
+        git annex initremote dv1 type=external externaltype=dataverse encryption=none
+        """
+        # we also need an active API connection for initremote,
+        # simply run prepare()
+        self.prepare()
+        # check if instance is readable and authenticated
+        resp = self._api.get_info_version()
+        if resp.json()['status'] != 'OK':
+            raise RuntimeError(f'Cannot connect to dataverse instance '
+                               f'(status: {resp.json()["status"]})')
+
+        # check if project with specified doi exists
+        dv_ds = self._api.get_dataset(identifier=self._doi)
+        if not dv_ds.ok:
+            raise RuntimeError("Cannot find dataset")
+
+    def checkpresent(self, key):
+        stored_id = self._get_annex_fileid_record(key)
+        if stored_id is not None:
+            # First, check latest version. Second, check older versions.
+            # This is to avoid requesting the full file list unless necessary.
+            return stored_id in self.files_latest.keys() or \
+                stored_id in self.files_old.keys()
+        else:
+            # We do not have an ID on record for this key.
+            # Fall back to filename matching for two reasons:
+            # 1. We have to deal with the special keys of the datalad-annex
+            #    git-remote-helper. They must be matched by name, since the
+            #    throwaway repo using them doesn't have a relevant git-annex
+            #    branch with an ID record (especially when cloning via the
+            #    git-remote-helper)
+            # 2. We are in "regular annex mode" here - keys are stored under
+            #    their name. Falling back to name matching allows recovering
+            #    data despite a lost or never generated id record for it. For
+            #    example, one could have uploaded lots of data via
+            #    git-annex-copy, but failed to push the git-annex branch
+            #    somewhere.
+ return Path(key) in [f.path for f in self.files_latest.values()] or \ + Path(key) in [f.path for f in self.files_old.values()] + + def transfer_store(self, key, local_file): + datafile = Datafile() + datafile.set({'filename': key, 'label': key}) + datafile.set({'pid': self._doi}) + + self._upload_file(datafile=datafile, + key=key, + local_file=local_file, + remote_file=Path(key)) + + def transfer_retrieve(self, key, file): + stored_id = self._get_annex_fileid_record(key) + if stored_id is not None: + file_id = stored_id + else: + # Like in `self.checkpresent`, we fall back to path matching. + # Delayed checking for availability from old versions is included. + file_id = self._get_fileid_from_key(key, latest_only=False) + if file_id is None: + raise RemoteError(f"Key {key} unavailable") + + self._download_file(file_id, file) + + def remove(self, key): + remote_file = Path(key) + self._remove_file(key, remote_file) + + # + # Export API + # + def checkpresentexport(self, key, remote_file): + stored_id = self._get_annex_fileid_record(key) + if stored_id is not None: + # Only check latest version in export mode. Doesn't currently + # work for keys from older versions, since annex fails to even + # try. See https://github.com/datalad/datalad-dataverse/issues/146#issuecomment-1214409351. + return stored_id in self.files_latest.keys() + else: + # In export mode, we need to fix remote paths: + remote_file = mangle_directory_names(remote_file) + return remote_file in [f.path for f in self.files_latest.values()] + + def transferexport_store(self, key, local_file, remote_file): + remote_file = mangle_directory_names(remote_file) + # TODO: See + # https://github.com/datalad/datalad-dataverse/issues/83#issuecomment-1214406034 + if re.search(pattern=r'[^a-z0-9_\-.\\/\ ]', + string=str(remote_file.parent), + flags=re.ASCII | re.IGNORECASE): + self.annex.error(f"Invalid character in directory name of " + f"{str(remote_file)}. Valid characters are a-Z, " + f"0-9, '_', '-', '.', '\\', '/' and ' ' " + f"(white space).") + + datafile = Datafile() + datafile.set({'filename': remote_file.name, + 'directoryLabel': str(remote_file.parent), + 'label': remote_file.name, + 'pid': self._doi}) + + self._upload_file(datafile, key, local_file, remote_file) + + def transferexport_retrieve(self, key, local_file, remote_file): + # In export mode, we need to fix remote paths: + remote_file = mangle_directory_names(remote_file) + + file_id = self._get_annex_fileid_record(key) \ + or self._get_fileid_from_exportpath(remote_file, latest_only=True) + if file_id is None: + raise RemoteError(f"Key {key} unavailable") + + self._download_file(file_id, local_file) + + def removeexport(self, key, remote_file): + remote_file = mangle_directory_names(remote_file) + self._remove_file(key, remote_file) + + def renameexport(self, key, filename, new_filename): + """Moves an exported file. + + If implemented, this is called by annex-export when a file was moved. + Otherwise annex calls removeexport + transferexport_store, which doesn't + scale well performance-wise. + """ + # Note: In opposition to other API methods, `update_datafile_metadata` + # is running `curl` in a subprocess. No idea why. As a consequence, this + # depends on the availability of curl and the return value is not (as in + # all other cases) a `requests.Response` object, but a + # `subprocess.CompletedProcess`. 
+        # This apparently is planned to be changed in pydataverse 0.4.0:
+        # https://github.com/gdcc/pyDataverse/issues/88
+        if not CURL_EXISTS:
+            raise UnsupportedRequest()
+
+        filename = mangle_directory_names(filename)
+        new_filename = mangle_directory_names(new_filename)
+
+        file_id = self._get_annex_fileid_record(key) \
+            or self._get_fileid_from_exportpath(filename, latest_only=True)
+        if file_id is None:
+            raise RemoteError(f"{key} not available for renaming")
+
+        datafile = Datafile()
+        datafile.set({'filename': new_filename.name,
+                      'directoryLabel': str(new_filename.parent),
+                      'label': new_filename.name,
+                      'pid': self._doi})
+
+        proc = self._api.update_datafile_metadata(
+            file_id,
+            json_str=datafile.json(),
+            is_filepid=False,
+        )
+        if proc.returncode:
+            raise RemoteError(f"Renaming failed: {proc.stderr}")
+
+    #
+    # Helpers
+    #
+    @property
+    def data_access_api(self):
+        if self._data_access_api is None:
+            self._data_access_api = DataAccessApi(
+                base_url=self._url,
+                # this relies on having established the NativeApi in prepare()
+                api_token=self._token,
+            )
+        return self._data_access_api
+
+    @property
+    def old_dataset_versions(self):
+        """Full JSON record of the dataverse dataset.
+
+        This is requested once when relevant to look for a key that is not
+        present in the latest version of the dataverse dataset. In that case,
+        `files_old` is built from it.
+        """
+
+        if self._old_dataset_versions is None:
+            # This delivers a full record of all known versions of this dataset.
+            # Hence, the file lists in the version entries may contain
+            # duplicates (unchanged files across versions).
+            self.message("Request all dataset versions", type='debug')
+            versions = self._api.get_dataset_versions(self._doi)
+            versions.raise_for_status()
+
+            self._old_dataset_versions = versions.json()['data']
+            # Expected structure in self._dataset is a list of (version-)
+            # dictionaries, which should have a field 'files'. This again is a
+            # list of dicts like this:
+            #  {'description': '',
+            #   'label': 'third_file.md',
+            #   'restricted': False,
+            #   'directoryLabel': 'subdir2',
+            #   'version': 1,
+            #   'datasetVersionId': 72,
+            #   'dataFile': {'id': 682,
+            #    'persistentId': '',
+            #    'pidURL': '',
+            #    'filename': 'third_file.md',
+            #    'contentType': 'text/plain',
+            #    'filesize': 9,
+            #    'description': '',
+            #    'storageIdentifier': 'local://1821bc70e68-c3c9dedcfce6',
+            #    'rootDataFileId': -1,
+            #    'md5': 'd8d77109f4a24efc3bd53d7cabb7ee35',
+            #    'checksum': {'type': 'MD5',
+            #                 'value': 'd8d77109f4a24efc3bd53d7cabb7ee35'},
+            #    'creationDate': '2022-07-20'}
+
+            # Sort by version, so we can rely on the last entry to refer to the
+            # latest version.
+            # Note, that ('versionNumber', 'versionMinorNumber', 'versionState')
+            # would look like this:
+            # (None, None, 'DRAFT'), (2, 0, 'RELEASED'), (1, 0, 'RELEASED')
+            # and we need a possible DRAFT to have the greatest key WRT sorting.
+            self._old_dataset_versions.sort(
+                key=lambda v: (v.get('versionNumber') or sys.maxsize,
+                               v.get('versionMinorNumber') or sys.maxsize),
+                reverse=False)
+            # Remove "latest" - we already have that
+            self._old_dataset_versions = self._old_dataset_versions[:-1]
+
+        return self._old_dataset_versions
+
+    @property
+    def dataset_latest(self):
+        """JSON representation of the latest version of the dataverse dataset.
+
+        This is used to initialize `files_latest` and only requested once.
+ """ + + if self._dataset_latest is None: + self.message("Request latest dataset version", type='debug') + dataset = self._api.get_dataset( + identifier=self._doi, + version=":latest", + ) + dataset.raise_for_status() + self._dataset_latest = dataset.json()['data']['latestVersion'] + return self._dataset_latest + + @property + def files_old(self): + """Files available from older dataverse dataset versions. + + For quick lookup and deduplication, this is a dict {id: FileIdRecord} + """ + + if self._files_old is None: + self._files_old = { + f['dataFile']['id']: FileIdRecord( + Path(f.get('directoryLabel', '')) / f['dataFile']['filename'], + True # older versions are always released + ) + for file_lists in [ + (version['files'], version['versionState']) + for version in self.old_dataset_versions + ] + for f in file_lists[0] + } + + return self._files_old + + @property + def files_latest(self): + """Cache of files in the latest version of the dataverse dataset. + + This refers to the DRAFT version (if there is any) or the latest + published version otherwise. That's the version pushes go into. Hence, + this is needed to determine whether we need and can replace/remove a + file, while the complete list in `self.files_old` is relevant for key + retrieval of keys that are not present in the latest version anymore. + + Note, that whie initially we may not be in a draft, we are as soon as we + change things (upload/repace/remove/rename). We keep track of those + changes herein w/o rerequesting the new state. + """ + + if self._files_latest is None: + # Latest version in self.dataset is first entry. + self._files_latest = { + f['dataFile']['id']: FileIdRecord( + Path(f.get('directoryLabel', '')) / f['dataFile']['filename'], + self.dataset_latest['versionState'] == "RELEASED", + ) + for f in self.dataset_latest['files'] + } + + return self._files_latest + + def remove_from_filelist(self, id): + """Update self.files_latest after removal""" + # make sure this property actually exists before assigning: + # (This may happen when git-annex-export decides to remove a key w/o + # even considering checkpresent) + self.files_latest + self._files_latest.pop(id, None) + + def add_to_filelist(self, d): + """Update self.files_latest after upload + + d: dict + dataverse description dict of the file; this dict is in the list + 'data.files' of the response to a successful upload + """ + # make sure this property actually exists before assigning: + # (This may happen on `git-annex-copy --fast`) + self.files_latest + + self._files_latest[d['dataFile']['id']] = FileIdRecord( + Path(d.get('directoryLabel', '')) / d['dataFile']['filename'], + False # We just added - it can't be released + ) + + def _get_annex_fileid_record(self, key: str) -> int | None: + """Get the dataverse database id from the git-annex branch + + This is using the getstate/setstate special remote feature. Hence, a + stored id only exists, if the key was put to the dataverse instance by + this special remote. + + Parameters + ---------- + key: str + annex key to retrieve the id for + + Returns + ------- + int or None + """ + stored_id = self.annex.getstate(key) + if stored_id == "": + return None + else: + return int(stored_id) + + def _set_annex_fileid_record(self, key, id): + """Store a dataverse database id for a given key + + Parameters + ---------- + key: str + annex key to store the id for + id: int or str + dataverse database id for `key`. Empty string to unset. 
+ """ + self.annex.setstate(key, str(id)) + + def _get_fileid_from_key(self, + key: str, + *, + latest_only: bool) -> int | None: + """Get the id of a dataverse file, that matches a given annex key + dataverse dataset. + + This method assumes that keys are deposited under paths that are + identical to the key name. + + Parameters + ---------- + key: + Annex key to perform the lookup for + latest_only: bool + Whether to only consider the latest version on dataverse. If + `False`, matching against older versions will only be performed + when there was no match in the latest version (implies that an + additional request may be performed) + + Returns + ------- + int or None + """ + # for now this also just performs a look-up by path + # but if other metadata-based lookups become possible + # this implementation could change + # https://github.com/datalad/datalad-dataverse/issues/188 + return self._get_fileid_from_exportpath( + Path(key), + latest_only=latest_only, + ) + + def _get_fileid_from_exportpath(self, + path: Path, + *, + latest_only: bool) -> int | None: + """Get the id of a dataverse file, that matches a given `Path` in the + dataverse dataset. + + Parameters + ---------- + path: Path + latest_only: bool + Whether to only consider the latest version on dataverse. If + `False`, matching against older versions will only be performed + when there was no match in the latest version (implies that an + additional request may be performed) + + Returns + ------- + int or None + """ + existing_id = [i for i, f in self.files_latest.items() + if f.path == path] + if not latest_only and not existing_id: + existing_id = [i for i, f in self.files_old.items() + if f.path == path] + return existing_id[0] if existing_id else None + + def _upload_file(self, datafile, key, local_file, remote_file): + """helper for both transfer-store methods""" + # If the remote path already exists, we need to replace rather than + # upload the file, since otherwise dataverse would rename the file on + # its end. However, this only concerns the latest version of the dataset + # (which is what we are pushing into)! + replace_id = self._get_fileid_from_exportpath( + remote_file, latest_only=True) + if replace_id is not None: + self.message(f"Replacing {remote_file} ...", type='debug') + response = self._api.replace_datafile( + identifier=replace_id, + filename=local_file, + json_str=datafile.json(), + is_filepid=False, + ) + else: + self.message(f"Uploading {remote_file} ...", type='debug') + response = self._api.upload_datafile( + identifier=self._doi, + filename=local_file, + json_str=datafile.json(), + ) + + if response.status_code == 400 and \ + response.json()['status'] == "ERROR" and \ + "duplicate content" in response.json()['message']: + # Ignore this one for now. + # TODO: This needs better handling. Currently, this happens in + # git-annex-testremote ("store when already present"). + # Generally it's kinda fine, but we'd better figure this out more + # reliably. Note, that we have to deal with annex keys, which are + # not hash based (for example the special keys fo datalad-annex + # git-remote-helper). + # Hence, having the key on the remote end, doesn't mean it's + # identical. So, we can't catch it beforehand this way. + self.message( + f"Failed to upload {key}, since dataverse says we are " + f"replacing with duplicate content.", type='debug' + ) + return # nothing changed and nothing needs to be done + else: + response.raise_for_status() + + # Success. 
+
+        # If we replaced, `replaced_id` is not part of the latest version
+        # anymore.
+        if replace_id is not None:
+            self.remove_from_filelist(replace_id)
+            # In case of replace we need to figure whether the replaced
+            # ID was part of a DRAFT version only. In that case it's gone and
+            # we'd need to remove the ID record. Otherwise, it's still retrieval
+            # from an old, published version.
+            # Note, that this would potentially trigger the request of the full
+            # file list (`self.files_old`).
+            if not (self.files_latest[replace_id].is_released or
+                    replace_id in self.files_old.keys()):
+                self._set_annex_fileid_record(key, "")
+
+        uploaded_file = response.json()['data']['files'][0]
+        # update cache:
+        self.add_to_filelist(uploaded_file)
+        # remember dataverse's database id for this key
+        self._set_annex_fileid_record(key, uploaded_file['dataFile']['id'])
+
+    def _download_file(self, file_id, local_file):
+        """helper for both transfer-retrieve methods"""
+        response = self.data_access_api.get_datafile(file_id)
+        # http error handling
+        response.raise_for_status()
+        with open(local_file, "wb") as f:
+            f.write(response.content)
+
+    def _remove_file(self, key, remote_file):
+        """helper for both remove methods"""
+        rm_id = self._get_annex_fileid_record(key) \
+            or self._get_fileid_from_exportpath(remote_file, latest_only=True)
+
+        if rm_id is None:
+            # We didn't find anything to remove. That should be fine and
+            # considered a successful removal by git-annex.
+            return
+        if rm_id not in self.files_latest.keys():
+            # We can't remove from older (hence published) versions.
+            return
+
+        status = delete(
+            f'{self._url}/dvn/api/data-deposit/v1.1/swordv2/'
+            f'edit-media/file/{rm_id}',
+            # this relies on having established the NativeApi in prepare()
+            auth=HTTPBasicAuth(self._token, ''))
+        # http error handling
+        status.raise_for_status()
+        # We need to figure whether the removed ID was part of a released
+        # version. In that case it's still retrievable from an old, published
+        # version.
+        # Note, that this would potentially trigger the request of the full
+        # file list (`self.files_old`).
+        if not (self.files_latest[rm_id].is_released or
+                rm_id in self.files_old.keys()):
+            self.message(f"Unset stored id for {key}", type='debug')
+            self._set_annex_fileid_record(key, "")
+        else:
+            # Despite not actually deleting from the dataverse database, we
+            # currently lose access to the old key (in export mode, that is),
+            # because annex registers a successful REMOVEEXPORT and there seems
+            # to be no way to make annex even try to run a CHECKPRESENT(-EXPORT)
+            # on an export remote in such case. get, fsck, checkpresentkey -
+            # none of them would do.
+            # TODO: We could try to setpresenturl for the not-really-removed
+            # file, if it has a persistent URL (should be findable in
+            # self.old_dataset_versions) or even via api/access/datafile/811.
+            # However, that depends on permissions, etc., so not clear it's
+            # useful or desirable to always do that.
+            # Otherwise not seeing a solution ATM. See https://github.com/datalad/datalad-dataverse/issues/146#issuecomment-1214409351
+            pass
+        # This ID is not part of the latest version anymore.
+ self.remove_from_filelist(rm_id) + + +def main(): + """cmdline entry point""" + super_main( + cls=DataverseRemote, + remote_name='dataverse', + description="transport file content to and from a Dataverse dataset", + ) diff --git a/datalad_dataverse/remote.py b/datalad_dataverse/remote.py index 462c112..f3017e8 100644 --- a/datalad_dataverse/remote.py +++ b/datalad_dataverse/remote.py @@ -29,6 +29,8 @@ mangle_directory_names, ) +from .baseremote import DataverseRemote as BaseDataverseRemote + # Object to hold what's on dataverse's end for a given database id. # We need the paths in the latest version (if the id is part of that) in order # to know whether we need to replace rather than just upload a file, and we need @@ -43,7 +45,7 @@ CURL_EXISTS = which('curl') is not None -class DataverseRemote(ExportRemote, SpecialRemote): +class DataverseRemote(ExportRemote, BaseDataverseRemote): """Special remote to interface dataverse datasets. There are two modes of operation: From d4ebc2371775348e7221dd0592529cc8a1199d0f Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 10 Mar 2023 11:45:56 +0100 Subject: [PATCH 2/6] Deduplicate primary API --- datalad_dataverse/baseremote.py | 95 +------------------- datalad_dataverse/remote.py | 153 ++------------------------------ 2 files changed, 6 insertions(+), 242 deletions(-) diff --git a/datalad_dataverse/baseremote.py b/datalad_dataverse/baseremote.py index 462c112..a981afe 100644 --- a/datalad_dataverse/baseremote.py +++ b/datalad_dataverse/baseremote.py @@ -1,10 +1,8 @@ from __future__ import annotations from pathlib import Path -import re import sys -from annexremote import ExportRemote from collections import namedtuple from pyDataverse.api import DataAccessApi from pyDataverse.models import Datafile @@ -15,7 +13,6 @@ from datalad_next.annexremotes import ( RemoteError, SpecialRemote, - UnsupportedRequest, super_main, ) from datalad_next.credman import CredentialManager @@ -26,7 +23,6 @@ from datalad_dataverse.utils import ( get_api, format_doi, - mangle_directory_names, ) # Object to hold what's on dataverse's end for a given database id. @@ -43,7 +39,7 @@ CURL_EXISTS = which('curl') is not None -class DataverseRemote(ExportRemote, SpecialRemote): +class DataverseRemote(SpecialRemote): """Special remote to interface dataverse datasets. There are two modes of operation: @@ -254,95 +250,6 @@ def remove(self, key): remote_file = Path(key) self._remove_file(key, remote_file) - # - # Export API - # - def checkpresentexport(self, key, remote_file): - stored_id = self._get_annex_fileid_record(key) - if stored_id is not None: - # Only check latest version in export mode. Doesn't currently - # work for keys from older versions, since annex fails to even - # try. See https://github.com/datalad/datalad-dataverse/issues/146#issuecomment-1214409351. - return stored_id in self.files_latest.keys() - else: - # In export mode, we need to fix remote paths: - remote_file = mangle_directory_names(remote_file) - return remote_file in [f.path for f in self.files_latest.values()] - - def transferexport_store(self, key, local_file, remote_file): - remote_file = mangle_directory_names(remote_file) - # TODO: See - # https://github.com/datalad/datalad-dataverse/issues/83#issuecomment-1214406034 - if re.search(pattern=r'[^a-z0-9_\-.\\/\ ]', - string=str(remote_file.parent), - flags=re.ASCII | re.IGNORECASE): - self.annex.error(f"Invalid character in directory name of " - f"{str(remote_file)}. 
Valid characters are a-Z, " - f"0-9, '_', '-', '.', '\\', '/' and ' ' " - f"(white space).") - - datafile = Datafile() - datafile.set({'filename': remote_file.name, - 'directoryLabel': str(remote_file.parent), - 'label': remote_file.name, - 'pid': self._doi}) - - self._upload_file(datafile, key, local_file, remote_file) - - def transferexport_retrieve(self, key, local_file, remote_file): - # In export mode, we need to fix remote paths: - remote_file = mangle_directory_names(remote_file) - - file_id = self._get_annex_fileid_record(key) \ - or self._get_fileid_from_exportpath(remote_file, latest_only=True) - if file_id is None: - raise RemoteError(f"Key {key} unavailable") - - self._download_file(file_id, local_file) - - def removeexport(self, key, remote_file): - remote_file = mangle_directory_names(remote_file) - self._remove_file(key, remote_file) - - def renameexport(self, key, filename, new_filename): - """Moves an exported file. - - If implemented, this is called by annex-export when a file was moved. - Otherwise annex calls removeexport + transferexport_store, which doesn't - scale well performance-wise. - """ - # Note: In opposition to other API methods, `update_datafile_metadata` - # is running `curl` in a subprocess. No idea why. As a consequence, this - # depends on the availability of curl and the return value is not (as in - # all other cases) a `requests.Response` object, but a - # `subprocess.CompletedProcess`. - # This apparently is planned to be changed in pydataverse 0.4.0: - # https://github.com/gdcc/pyDataverse/issues/88 - if not CURL_EXISTS: - raise UnsupportedRequest() - - filename = mangle_directory_names(filename) - new_filename = mangle_directory_names(new_filename) - - file_id = self._get_annex_fileid_record(key) \ - or self._get_fileid_from_exportpath(filename, latest_only=True) - if file_id is None: - raise RemoteError(f"{key} not available for renaming") - - datafile = Datafile() - datafile.set({'filename': new_filename.name, - 'directoryLabel': str(new_filename.parent), - 'label': new_filename.name, - 'pid': self._doi}) - - proc = self._api.update_datafile_metadata( - file_id, - json_str=datafile.json(), - is_filepid=False, - ) - if proc.returncode: - raise RemoteError(f"Renaming failed: {proc.stderr}") - # # Helpers # diff --git a/datalad_dataverse/remote.py b/datalad_dataverse/remote.py index f3017e8..fb5f82b 100644 --- a/datalad_dataverse/remote.py +++ b/datalad_dataverse/remote.py @@ -5,44 +5,26 @@ import sys from annexremote import ExportRemote -from collections import namedtuple from pyDataverse.api import DataAccessApi from pyDataverse.models import Datafile from requests import delete from requests.auth import HTTPBasicAuth -from shutil import which from datalad_next.annexremotes import ( RemoteError, - SpecialRemote, UnsupportedRequest, super_main, ) -from datalad_next.credman import CredentialManager -# this important is a vast overstatement, we only need -# `AnnexRepo.config`, nothing else -from datalad_next.datasets import LegacyAnnexRepo as AnnexRepo from datalad_dataverse.utils import ( - get_api, - format_doi, mangle_directory_names, ) -from .baseremote import DataverseRemote as BaseDataverseRemote - -# Object to hold what's on dataverse's end for a given database id. -# We need the paths in the latest version (if the id is part of that) in order -# to know whether we need to replace rather than just upload a file, and we need -# to know whether an id is released, since that implies we can't replace it -# (but we could change the metadata, right?) 
and we can't actually delete it. -# The latter meaning: It can be removed from the new DRAFT version, but it's -# still available via its id from an older version of the dataverse dataset. -# This namedtuple is meant to be the value type of a dict with ids as its keys: -FileIdRecord = namedtuple("FileIdRecord", ["path", "is_released"]) - -# Needed to determine whether RENAMEEXPORT can be considered implemented. -CURL_EXISTS = which('curl') is not None +from .baseremote import ( + DataverseRemote as BaseDataverseRemote, + FileIdRecord, + CURL_EXISTS, +) class DataverseRemote(ExportRemote, BaseDataverseRemote): @@ -131,131 +113,6 @@ class DataverseRemote(ExportRemote, BaseDataverseRemote): having smaller but possibly much more requests is likely a lot more expensive. """ - - def __init__(self, *args): - super().__init__(*args) - self.configs['url'] = 'The Dataverse URL for the remote' - self.configs['doi'] = 'DOI to the dataset' - self.configs['dlacredential'] = \ - 'Identifier used to retrieve an API token from a local ' \ - 'credential store' - # dataverse dataset identifier - self._doi = None - # dataverse instance URL - self._url = None - # dataverse native API handle - self._api = None - self._data_access_api = None - self._token = None - self._old_dataset_versions = None - self._dataset_latest = None - self._files_old = None - self._files_latest = None - self.is_draft = None - - # - # Essential API - # - def prepare(self): - # remove any trailing slash from URL - url = self.annex.getconfig('url').rstrip('/') - if not url: - raise ValueError('url must be specified') - doi = self.annex.getconfig('doi') - if not doi: - raise ValueError('doi must be specified') - # standardize formating to minimize complexity downstream - self._doi = format_doi(doi) - self._url = url - # we need an acces token, use the repo's configmanager to - # query for one - repo = AnnexRepo(self.annex.getgitdir()) - # TODO the below is almost literally taken from - # the datalad-annex:: implementation in datalad-next - # this could become a comming helper - # TODO https://github.com/datalad/datalad-dataverse/issues/171 - credman = CredentialManager(repo.config) - credential_name = self.annex.getconfig('dlacredential') - api = get_api( - self._url, - credman, - credential_name=credential_name, - ) - # store for reuse with data access API. - # we do not initialize that one here, because it is only used - # for file downloads - self._token = api.api_token - self._api = api - - def initremote(self): - """ - Use this command to initialize a remote - git annex initremote dv1 type=external externaltype=dataverse encryption=none - """ - # we also need an active API connection for initremote, - # simply run prepare() - self.prepare() - # check if instance is readable and authenticated - resp = self._api.get_info_version() - if resp.json()['status'] != 'OK': - raise RuntimeError(f'Cannot connect to dataverse instance ' - f'(status: {resp.json()["status"]})') - - # check if project with specified doi exists - dv_ds = self._api.get_dataset(identifier=self._doi) - if not dv_ds.ok: - raise RuntimeError("Cannot find dataset") - - def checkpresent(self, key): - stored_id = self._get_annex_fileid_record(key) - if stored_id is not None: - # First, check latest version. Second, check older versions. - # This is to avoid requesting the full file list unless necessary. - return stored_id in self.files_latest.keys() or \ - stored_id in self.files_old.keys() - else: - # We do not have an ID on record for this key. 
-            # Fall back to filename matching for two reasons:
-            # 1. We have to deal with the special keys of the datalad-annex
-            #    git-remote-helper. They must be matched by name, since the
-            #    throwaway repo using them doesn't have a relevant git-annex
-            #    branch with an ID record (especially when cloning via the
-            #    git-remote-helper)
-            # 2. We are in "regular annex mode" here - keys are stored under
-            #    their name. Falling back to name matching allows to recover
-            #    data, despite a lost or not generated id record for it. For
-            #    example on could have uploaded lots of data via git-annex-copy,
-            #    but failed to push the git-annex branch somewhere.
-            return Path(key) in [f.path for f in self.files_latest.values()] or \
-                Path(key) in [f.path for f in self.files_old.values()]
-
-    def transfer_store(self, key, local_file):
-        datafile = Datafile()
-        datafile.set({'filename': key, 'label': key})
-        datafile.set({'pid': self._doi})
-
-        self._upload_file(datafile=datafile,
-                          key=key,
-                          local_file=local_file,
-                          remote_file=Path(key))
-
-    def transfer_retrieve(self, key, file):
-        stored_id = self._get_annex_fileid_record(key)
-        if stored_id is not None:
-            file_id = stored_id
-        else:
-            # Like in `self.checkpresent`, we fall back to path matching.
-            # Delayed checking for availability from old versions is included.
-            file_id = self._get_fileid_from_key(key, latest_only=False)
-            if file_id is None:
-                raise RemoteError(f"Key {key} unavailable")
-
-        self._download_file(file_id, file)
-
-    def remove(self, key):
-        remote_file = Path(key)
-        self._remove_file(key, remote_file)
-
     #
     # Export API
     #

From 81e361c08376456a1d3129de3ead9c0628ffc849 Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Fri, 10 Mar 2023 12:00:35 +0100
Subject: [PATCH 3/6] Deduplicate helpers, where association is clear and no
 change is required

---
 datalad_dataverse/baseremote.py |  14 --
 datalad_dataverse/remote.py     | 234 --------------------------------
 2 files changed, 248 deletions(-)

diff --git a/datalad_dataverse/baseremote.py b/datalad_dataverse/baseremote.py
index a981afe..d152ae5 100644
--- a/datalad_dataverse/baseremote.py
+++ b/datalad_dataverse/baseremote.py
@@ -42,12 +42,6 @@ class DataverseRemote(SpecialRemote):
     """Special remote to interface dataverse datasets.
 
-    There are two modes of operation:
-    - "Regular" special remote (configured with exporttree=false; default),
-      where a flat list of annex keys is put into that dataverse dataset, and
-    - export remote (configured with exporttree=yes), where the actual worktree
-      is put into the dataverse dataset
-
     Dataverse
     ---------
 
@@ -93,14 +87,6 @@ class DataverseRemote(SpecialRemote):
     to utilize knowledge about the database ID, so the idea is to use path
     matching only as a fallback.
 
-    Export remote
-    -------------
-
-    In export mode the special remote cannot conclude the annex key from a
-    remote path in general. In order to be able to access versions of a file
-    that are not part of the latest version (draft or not) of the dataverse
-    dataset, reliance on recorded database IDs is crucial.
- Implementation note ------------------- diff --git a/datalad_dataverse/remote.py b/datalad_dataverse/remote.py index fb5f82b..48f1d4c 100644 --- a/datalad_dataverse/remote.py +++ b/datalad_dataverse/remote.py @@ -2,10 +2,8 @@ from pathlib import Path import re -import sys from annexremote import ExportRemote -from pyDataverse.api import DataAccessApi from pyDataverse.models import Datafile from requests import delete from requests.auth import HTTPBasicAuth @@ -22,7 +20,6 @@ from .baseremote import ( DataverseRemote as BaseDataverseRemote, - FileIdRecord, CURL_EXISTS, ) @@ -205,229 +202,6 @@ def renameexport(self, key, filename, new_filename): # # Helpers # - @property - def data_access_api(self): - if self._data_access_api is None: - self._data_access_api = DataAccessApi( - base_url=self._url, - # this relies on having established the NativeApi in prepare() - api_token=self._token, - ) - return self._data_access_api - - @property - def old_dataset_versions(self): - """Full JSON record of the dataverse dataset. - - This is requested once when relevant to look for a key that is not - present in the latest version of the dataverse dataset. In such case, - `files_old` is build from it. - """ - - if self._old_dataset_versions is None: - # This delivers a full record of all known versions of this dataset. - # Hence, the file lists in the version entries may contain - # duplicates (unchanged files across versions). - self.message("Request all dataset versions", type='debug') - versions = self._api.get_dataset_versions(self._doi) - versions.raise_for_status() - - self._old_dataset_versions = versions.json()['data'] - # Expected structure in self._dataset is a list of (version-) - # dictionaries, which should have a field 'files'. This again is a - # list of dicts like this: - # {'description': '', - # 'label': 'third_file.md', - # 'restricted': False, - # 'directoryLabel': 'subdir2', - # 'version': 1, - # 'datasetVersionId': 72, - # 'dataFile': {'id': 682, - # 'persistentId': '', - # 'pidURL': '', - # 'filename': 'third_file.md', - # 'contentType': 'text/plain', - # 'filesize': 9, - # 'description': '', - # 'storageIdentifier': 'local://1821bc70e68-c3c9dedcfce6', - # 'rootDataFileId': -1, - # 'md5': 'd8d77109f4a24efc3bd53d7cabb7ee35', - # 'checksum': {'type': 'MD5', - # 'value': 'd8d77109f4a24efc3bd53d7cabb7ee35'}, - # 'creationDate': '2022-07-20'} - - # Sort by version, so we can rely on the last entry to refer to the - # latest version. - # Note, that ('versionNumber', 'versionMinorNumber', 'versionState') - # would look like this: - # (None, None, 'DRAFT'), (2, 0, 'RELEASED'), (1, 0, 'RELEASED') - # and we need a possible DRAFT to have the greatest key WRT sorting. - self._old_dataset_versions.sort( - key=lambda v: (v.get('versionNumber') or sys.maxsize, - v.get('versionMinorNumber') or sys.maxsize), - reverse=False) - # Remove "latest" - we already have that - self._old_dataset_versions = self._old_dataset_versions[:-1] - - return self._old_dataset_versions - - @property - def dataset_latest(self): - """JSON representation on the latest version of the dataverse dataset. - - This is used to initialize `files_latest` and only requested once. 
- """ - - if self._dataset_latest is None: - self.message("Request latest dataset version", type='debug') - dataset = self._api.get_dataset( - identifier=self._doi, - version=":latest", - ) - dataset.raise_for_status() - self._dataset_latest = dataset.json()['data']['latestVersion'] - return self._dataset_latest - - @property - def files_old(self): - """Files available from older dataverse dataset versions. - - For quick lookup and deduplication, this is a dict {id: FileIdRecord} - """ - - if self._files_old is None: - self._files_old = { - f['dataFile']['id']: FileIdRecord( - Path(f.get('directoryLabel', '')) / f['dataFile']['filename'], - True # older versions are always released - ) - for file_lists in [ - (version['files'], version['versionState']) - for version in self.old_dataset_versions - ] - for f in file_lists[0] - } - - return self._files_old - - @property - def files_latest(self): - """Cache of files in the latest version of the dataverse dataset. - - This refers to the DRAFT version (if there is any) or the latest - published version otherwise. That's the version pushes go into. Hence, - this is needed to determine whether we need and can replace/remove a - file, while the complete list in `self.files_old` is relevant for key - retrieval of keys that are not present in the latest version anymore. - - Note, that whie initially we may not be in a draft, we are as soon as we - change things (upload/repace/remove/rename). We keep track of those - changes herein w/o rerequesting the new state. - """ - - if self._files_latest is None: - # Latest version in self.dataset is first entry. - self._files_latest = { - f['dataFile']['id']: FileIdRecord( - Path(f.get('directoryLabel', '')) / f['dataFile']['filename'], - self.dataset_latest['versionState'] == "RELEASED", - ) - for f in self.dataset_latest['files'] - } - - return self._files_latest - - def remove_from_filelist(self, id): - """Update self.files_latest after removal""" - # make sure this property actually exists before assigning: - # (This may happen when git-annex-export decides to remove a key w/o - # even considering checkpresent) - self.files_latest - self._files_latest.pop(id, None) - - def add_to_filelist(self, d): - """Update self.files_latest after upload - - d: dict - dataverse description dict of the file; this dict is in the list - 'data.files' of the response to a successful upload - """ - # make sure this property actually exists before assigning: - # (This may happen on `git-annex-copy --fast`) - self.files_latest - - self._files_latest[d['dataFile']['id']] = FileIdRecord( - Path(d.get('directoryLabel', '')) / d['dataFile']['filename'], - False # We just added - it can't be released - ) - - def _get_annex_fileid_record(self, key: str) -> int | None: - """Get the dataverse database id from the git-annex branch - - This is using the getstate/setstate special remote feature. Hence, a - stored id only exists, if the key was put to the dataverse instance by - this special remote. - - Parameters - ---------- - key: str - annex key to retrieve the id for - - Returns - ------- - int or None - """ - stored_id = self.annex.getstate(key) - if stored_id == "": - return None - else: - return int(stored_id) - - def _set_annex_fileid_record(self, key, id): - """Store a dataverse database id for a given key - - Parameters - ---------- - key: str - annex key to store the id for - id: int or str - dataverse database id for `key`. Empty string to unset. 
-        """
-        self.annex.setstate(key, str(id))
-
-    def _get_fileid_from_key(self,
-                             key: str,
-                             *,
-                             latest_only: bool) -> int | None:
-        """Get the id of a dataverse file, that matches a given annex key
-        dataverse dataset.
-
-        This method assumes that keys are deposited under paths that are
-        identical to the key name.
-
-        Parameters
-        ----------
-        key:
-            Annex key to perform the lookup for
-        latest_only: bool
-            Whether to only consider the latest version on dataverse. If
-            `False`, matching against older versions will only be performed
-            when there was no match in the latest version (implies that an
-            additional request may be performed)
-
-        Returns
-        -------
-        int or None
-        """
-        # for now this also just performs a look-up by path
-        # but if other metadata-based lookups become possible
-        # this implementation could change
-        # https://github.com/datalad/datalad-dataverse/issues/188
-        return self._get_fileid_from_exportpath(
-            Path(key),
-            latest_only=latest_only,
-        )
-
     def _get_fileid_from_exportpath(self,
                                     path: Path,
                                     *,
                                     latest_only: bool) -> int | None:
@@ -521,14 +295,6 @@ def _upload_file(self, datafile, key, local_file, remote_file):
         # remember dataverse's database id for this key
         self._set_annex_fileid_record(key, uploaded_file['dataFile']['id'])
 
-    def _download_file(self, file_id, local_file):
-        """helper for both transfer-retrieve methods"""
-        response = self.data_access_api.get_datafile(file_id)
-        # http error handling
-        response.raise_for_status()
-        with open(local_file, "wb") as f:
-            f.write(response.content)
-
     def _remove_file(self, key, remote_file):
         """helper for both remove methods"""
         rm_id = self._get_annex_fileid_record(key) \

From fbde6cb8a4723270c6a209ea0475c48a288fd138 Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Fri, 10 Mar 2023 12:29:06 +0100
Subject: [PATCH 4/6] Deduplicate `_remove_file()` helper

The necessary `fileId` is now determined appropriately for each mode,
and then a joint helper in the base implementation is called.

Closes #183
---
 datalad_dataverse/baseremote.py | 16 ++++++-----
 datalad_dataverse/remote.py     | 50 ++-------------------------------
 2 files changed, 12 insertions(+), 54 deletions(-)

diff --git a/datalad_dataverse/baseremote.py b/datalad_dataverse/baseremote.py
index d152ae5..df54ebd 100644
--- a/datalad_dataverse/baseremote.py
+++ b/datalad_dataverse/baseremote.py
@@ -233,8 +233,9 @@ def transfer_retrieve(self, key, file):
         self._download_file(file_id, file)
 
     def remove(self, key):
-        remote_file = Path(key)
-        self._remove_file(key, remote_file)
+        rm_id = self._get_annex_fileid_record(key) \
+            or self._get_fileid_from_key(key, latest_only=True)
+        self._remove_file(key, rm_id)
 
     #
     # Helpers
@@ -563,13 +564,14 @@ def _download_file(self, file_id, local_file):
         with open(local_file, "wb") as f:
             f.write(response.content)
 
-    def _remove_file(self, key, remote_file):
-        """helper for both remove methods"""
-        rm_id = self._get_annex_fileid_record(key) \
-            or self._get_fileid_from_exportpath(remote_file, latest_only=True)
+    def _remove_file(self, key: str, rm_id: int | None):
+        """Remove a file by dataverse fileId
 
+        It is OK to call this method even when ``rm_id`` is ``None``. It
+        anticipates this case in order to provide uniform handling.
+        """
         if rm_id is None:
-            # We didn't find anything to remove. That should be fine and
+            # We don't have anything to remove. That should be fine and
             # considered a successful removal by git-annex.
return if rm_id not in self.files_latest.keys(): diff --git a/datalad_dataverse/remote.py b/datalad_dataverse/remote.py index 48f1d4c..9bcf615 100644 --- a/datalad_dataverse/remote.py +++ b/datalad_dataverse/remote.py @@ -158,7 +158,9 @@ def transferexport_retrieve(self, key, local_file, remote_file): def removeexport(self, key, remote_file): remote_file = mangle_directory_names(remote_file) - self._remove_file(key, remote_file) + rm_id = self._get_annex_fileid_record(key) \ + or self._get_fileid_from_exportpath(remote_file, latest_only=True) + self._remove_file(key, rm_id) def renameexport(self, key, filename, new_filename): """Moves an exported file. @@ -295,52 +297,6 @@ def _upload_file(self, datafile, key, local_file, remote_file): # remember dataverse's database id for this key self._set_annex_fileid_record(key, uploaded_file['dataFile']['id']) - def _remove_file(self, key, remote_file): - """helper for both remove methods""" - rm_id = self._get_annex_fileid_record(key) \ - or self._get_fileid_from_exportpath(remote_file, latest_only=True) - - if rm_id is None: - # We didn't find anything to remove. That should be fine and - # considered a successful removal by git-annex. - return - if rm_id not in self.files_latest.keys(): - # We can't remove from older (hence published) versions. - return - - status = delete( - f'{self._url}/dvn/api/data-deposit/v1.1/swordv2/' - f'edit-media/file/{rm_id}', - # this relies on having established the NativeApi in prepare() - auth=HTTPBasicAuth(self._token, '')) - # http error handling - status.raise_for_status() - # We need to figure whether the removed ID was part of a released - # version. In that case it's still retrievable from an old, published - # version. - # Note, that this would potentially trigger the request of the full - # file list (`self.files_old`). - if not (self.files_latest[rm_id].is_released or - rm_id in self.files_old.keys()): - self.message(f"Unset stored id for {key}", type='debug') - self._set_annex_fileid_record(key, "") - else: - # Despite not actually deleting from the dataverse database, we - # currently loose access to the old key (in export mode, that is), - # because annex registers a successful REMOVEEXPORT and there seems - # to be no way to make annex even try to run a CHECKPRESENT(-EXPORT) - # on an export remote in such case. get, fsck, checkpresentkey - - # none of them would do. - # TODO: We could try to setpresenturl for the not-really-removed - # file, if it has a persistent URL (should be findable in - # self.old_dataset_versions) or even via api/access/datafile/811. - # However, that depends on permissions, etc., so not clear it's - # useful or desireable to always do that. - # Otherwise not seeing a solution ATM. See https://github.com/datalad/datalad-dataverse/issues/146#issuecomment-1214409351 - pass - # This ID is not part of the latest version anymore. 
- self.remove_from_filelist(rm_id) - def main(): """cmdline entry point""" From 26daceaffbef0ab2f7272193f8fe82ba235b38ca Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 10 Mar 2023 12:43:34 +0100 Subject: [PATCH 5/6] Deduplicate `_upload_file()` helper --- datalad_dataverse/baseremote.py | 36 ++++++++-------- datalad_dataverse/remote.py | 75 ++++----------------------------- 2 files changed, 27 insertions(+), 84 deletions(-) diff --git a/datalad_dataverse/baseremote.py b/datalad_dataverse/baseremote.py index df54ebd..8a1c71f 100644 --- a/datalad_dataverse/baseremote.py +++ b/datalad_dataverse/baseremote.py @@ -214,10 +214,18 @@ def transfer_store(self, key, local_file): datafile.set({'filename': key, 'label': key}) datafile.set({'pid': self._doi}) - self._upload_file(datafile=datafile, - key=key, - local_file=local_file, - remote_file=Path(key)) + # If the remote path already exists, we need to replace rather than + # upload the file, since otherwise dataverse would rename the file on + # its end. However, this only concerns the latest version of the + # dataset (which is what we are pushing into)! + replace_id = self._get_fileid_from_key(key, latest_only=True) + + self._upload_file( + datafile=datafile, + key=key, + local_file=local_file, + replace_id=replace_id, + ) def transfer_retrieve(self, key, file): stored_id = self._get_annex_fileid_record(key) @@ -490,16 +498,10 @@ def _get_fileid_from_exportpath(self, if f.path == path] return existing_id[0] if existing_id else None - def _upload_file(self, datafile, key, local_file, remote_file): + def _upload_file(self, datafile, key, local_file, replace_id): """helper for both transfer-store methods""" - # If the remote path already exists, we need to replace rather than - # upload the file, since otherwise dataverse would rename the file on - # its end. However, this only concerns the latest version of the dataset - # (which is what we are pushing into)! - replace_id = self._get_fileid_from_exportpath( - remote_file, latest_only=True) if replace_id is not None: - self.message(f"Replacing {remote_file} ...", type='debug') + self.message(f"Replacing fileId {replace_id} ...", type='debug') response = self._api.replace_datafile( identifier=replace_id, filename=local_file, @@ -507,7 +509,7 @@ def _upload_file(self, datafile, key, local_file, remote_file): is_filepid=False, ) else: - self.message(f"Uploading {remote_file} ...", type='debug') + self.message(f"Uploading key {key} ...", type='debug') response = self._api.upload_datafile( identifier=self._doi, filename=local_file, @@ -542,12 +544,12 @@ def _upload_file(self, datafile, key, local_file, remote_file): self.remove_from_filelist(replace_id) # In case of replace we need to figure whether the replaced # ID was part of a DRAFT version only. In that case it's gone and - # we'd need to remove the ID record. Otherwise, it's still retrieval - # from an old, published version. + # we'd need to remove the ID record. Otherwise, it's still + # retrieval from an old, published version. # Note, that this would potentially trigger the request of the full # file list (`self.files_old`). 
- if not (self.files_latest[replace_id].is_released or - replace_id in self.files_old.keys()): + if not (self.files_latest[replace_id].is_released \ + or replace_id in self.files_old.keys()): self._set_annex_fileid_record(key, "") uploaded_file = response.json()['data']['files'][0] diff --git a/datalad_dataverse/remote.py b/datalad_dataverse/remote.py index 9bcf615..43ce1ef 100644 --- a/datalad_dataverse/remote.py +++ b/datalad_dataverse/remote.py @@ -143,7 +143,14 @@ def transferexport_store(self, key, local_file, remote_file): 'label': remote_file.name, 'pid': self._doi}) - self._upload_file(datafile, key, local_file, remote_file) + # If the remote path already exists, we need to replace rather than + # upload the file, since otherwise dataverse would rename the file on + # its end. However, this only concerns the latest version of the + # dataset (which is what we are pushing into)! + replace_id = self._get_fileid_from_exportpath( + remote_file, latest_only=True) + + self._upload_file(datafile, key, local_file, replace_id) def transferexport_retrieve(self, key, local_file, remote_file): # In export mode, we need to fix remote paths: @@ -231,72 +238,6 @@ def _get_fileid_from_exportpath(self, if f.path == path] return existing_id[0] if existing_id else None - def _upload_file(self, datafile, key, local_file, remote_file): - """helper for both transfer-store methods""" - # If the remote path already exists, we need to replace rather than - # upload the file, since otherwise dataverse would rename the file on - # its end. However, this only concerns the latest version of the dataset - # (which is what we are pushing into)! - replace_id = self._get_fileid_from_exportpath( - remote_file, latest_only=True) - if replace_id is not None: - self.message(f"Replacing {remote_file} ...", type='debug') - response = self._api.replace_datafile( - identifier=replace_id, - filename=local_file, - json_str=datafile.json(), - is_filepid=False, - ) - else: - self.message(f"Uploading {remote_file} ...", type='debug') - response = self._api.upload_datafile( - identifier=self._doi, - filename=local_file, - json_str=datafile.json(), - ) - - if response.status_code == 400 and \ - response.json()['status'] == "ERROR" and \ - "duplicate content" in response.json()['message']: - # Ignore this one for now. - # TODO: This needs better handling. Currently, this happens in - # git-annex-testremote ("store when already present"). - # Generally it's kinda fine, but we'd better figure this out more - # reliably. Note, that we have to deal with annex keys, which are - # not hash based (for example the special keys fo datalad-annex - # git-remote-helper). - # Hence, having the key on the remote end, doesn't mean it's - # identical. So, we can't catch it beforehand this way. - self.message( - f"Failed to upload {key}, since dataverse says we are " - f"replacing with duplicate content.", type='debug' - ) - return # nothing changed and nothing needs to be done - else: - response.raise_for_status() - - # Success. - - # If we replaced, `replaced_id` is not part of the latest version - # anymore. - if replace_id is not None: - self.remove_from_filelist(replace_id) - # In case of replace we need to figure whether the replaced - # ID was part of a DRAFT version only. In that case it's gone and - # we'd need to remove the ID record. Otherwise, it's still retrieval - # from an old, published version. - # Note, that this would potentially trigger the request of the full - # file list (`self.files_old`). 
From b5f722801d1117a0b6a3aa3ab5eac1594e15d9ff Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Fri, 10 Mar 2023 13:17:56 +0100
Subject: [PATCH 6/6] Complete separation of Export and Non-Export functionality

Closes #177
---
 datalad_dataverse/baseremote.py | 16 +++++++------
 datalad_dataverse/remote.py | 41 ++++-----------------------------
 2 files changed, 13 insertions(+), 44 deletions(-)

diff --git a/datalad_dataverse/baseremote.py b/datalad_dataverse/baseremote.py
index 8a1c71f..3ab3100 100644
--- a/datalad_dataverse/baseremote.py
+++ b/datalad_dataverse/baseremote.py
@@ -466,17 +466,19 @@ def _get_fileid_from_key(self,
         # but if other metadata-based lookups become possible
         # this implementation could change
         # https://github.com/datalad/datalad-dataverse/issues/188
-        return self._get_fileid_from_exportpath(
+        return self._get_fileid_from_remotepath(
             Path(key),
             latest_only=latest_only,
         )
 
-    def _get_fileid_from_exportpath(self,
-                                    path: Path,
-                                    *,
-                                    latest_only: bool) -> int | None:
-        """Get the id of a dataverse file, that matches a given `Path` in the
-        dataverse dataset.
+    def _get_fileid_from_remotepath(
+            self,
+            path: Path,
+            *,
+            latest_only: bool) -> int | None:
+        """Get the id of a dataverse file that matches a given path in the
+        dataverse dataset. The path is interpreted as the combination of a
+        ``directoryLabel`` and a ``label`` (filename) in dataverse terminology.
 
         Parameters
         ----------
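The renamed helper's matching logic is unchanged and short enough to restate as a plain function (the `files_latest`/`files_old` mappings here are stand-ins for the remote's cached file lists; the removed duplicate in remote.py below shows the original method form):

    from collections import namedtuple
    from pathlib import Path

    FileIdRecord = namedtuple("FileIdRecord", ["path", "is_released"])

    def get_fileid_from_remotepath(path, files_latest, files_old, *, latest_only):
        # check the latest dataset version first; only consult the lazily
        # fetched (and potentially expensive) record of older versions
        # when permitted and nothing matched
        existing_id = [i for i, f in files_latest.items() if f.path == path]
        if not latest_only and not existing_id:
            existing_id = [i for i, f in files_old.items() if f.path == path]
        return existing_id[0] if existing_id else None

    files_latest = {7: FileIdRecord(Path("dir/data.csv"), True)}
    files_old = {3: FileIdRecord(Path("dir/gone.csv"), True)}
    assert get_fileid_from_remotepath(
        Path("dir/data.csv"), files_latest, files_old, latest_only=True) == 7
    assert get_fileid_from_remotepath(
        Path("dir/gone.csv"), files_latest, files_old, latest_only=False) == 3
    assert get_fileid_from_remotepath(
        Path("dir/gone.csv"), files_latest, files_old, latest_only=True) is None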
diff --git a/datalad_dataverse/remote.py b/datalad_dataverse/remote.py
index 43ce1ef..bd96c1f 100644
--- a/datalad_dataverse/remote.py
+++ b/datalad_dataverse/remote.py
@@ -1,12 +1,9 @@
 from __future__ import annotations
 
-from pathlib import Path
 import re
 
 from annexremote import ExportRemote
 from pyDataverse.models import Datafile
-from requests import delete
-from requests.auth import HTTPBasicAuth
 
 from datalad_next.annexremotes import (
     RemoteError,
@@ -147,7 +144,7 @@ def transferexport_store(self, key, local_file, remote_file):
         # upload the file, since otherwise dataverse would rename the file on
         # its end. However, this only concerns the latest version of the
         # dataset (which is what we are pushing into)!
-        replace_id = self._get_fileid_from_exportpath(
+        replace_id = self._get_fileid_from_remotepath(
             remote_file, latest_only=True)
 
         self._upload_file(datafile, key, local_file, replace_id)
@@ -157,7 +154,7 @@ def transferexport_retrieve(self, key, local_file, remote_file):
         remote_file = mangle_directory_names(remote_file)
 
         file_id = self._get_annex_fileid_record(key) \
-            or self._get_fileid_from_exportpath(remote_file, latest_only=True)
+            or self._get_fileid_from_remotepath(remote_file, latest_only=True)
 
         if file_id is None:
             raise RemoteError(f"Key {key} unavailable")
@@ -166,7 +163,7 @@ def removeexport(self, key, remote_file):
         remote_file = mangle_directory_names(remote_file)
         rm_id = self._get_annex_fileid_record(key) \
-            or self._get_fileid_from_exportpath(remote_file, latest_only=True)
+            or self._get_fileid_from_remotepath(remote_file, latest_only=True)
         self._remove_file(key, rm_id)
 
     def renameexport(self, key, filename, new_filename):
@@ -190,7 +187,7 @@ def renameexport(self, key, filename, new_filename):
         new_filename = mangle_directory_names(new_filename)
 
         file_id = self._get_annex_fileid_record(key) \
-            or self._get_fileid_from_exportpath(filename, latest_only=True)
+            or self._get_fileid_from_remotepath(filename, latest_only=True)
 
         if file_id is None:
             raise RemoteError(f"{key} not available for renaming")
@@ -208,36 +205,6 @@ def renameexport(self, key, filename, new_filename):
         if proc.returncode:
             raise RemoteError(f"Renaming failed: {proc.stderr}")
 
-    #
-    # Helpers
-    #
-    def _get_fileid_from_exportpath(self,
-                                    path: Path,
-                                    *,
-                                    latest_only: bool) -> int | None:
-        """Get the id of a dataverse file, that matches a given `Path` in the
-        dataverse dataset.
-
-        Parameters
-        ----------
-        path: Path
-        latest_only: bool
-            Whether to only consider the latest version on dataverse. If
-            `False`, matching against older versions will only be performed
-            when there was no match in the latest version (implies that an
-            additional request may be performed)
-
-        Returns
-        -------
-        int or None
-        """
-        existing_id = [i for i, f in self.files_latest.items()
-                       if f.path == path]
-        if not latest_only and not existing_id:
-            existing_id = [i for i, f in self.files_old.items()
-                           if f.path == path]
-        return existing_id[0] if existing_id else None
-
 
 def main():
     """cmdline entry point"""
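All four export operations above resolve a dataverse database id with the same two-step fallback: a previously recorded id for the annex key wins, and only without one is the export path matched against the latest version. A hypothetical standalone rendering of that pattern (a plain dict stands in for the per-key records the remote maintains):

    from collections import namedtuple
    from pathlib import Path

    FileIdRecord = namedtuple("FileIdRecord", ["path", "is_released"])

    def resolve_file_id(key, remote_file, annex_records, files_latest):
        # mirrors: self._get_annex_fileid_record(key) \
        #     or self._get_fileid_from_remotepath(remote_file, latest_only=True)
        recorded = annex_records.get(key)
        if recorded:
            return recorded
        ids = [i for i, f in files_latest.items() if f.path == remote_file]
        return ids[0] if ids else None

    annex_records = {"MD5E-s3--aaa.dat": 11}
    files_latest = {12: FileIdRecord(Path("figures/plot.png"), False)}
    # a recorded id wins, no path lookup needed
    assert resolve_file_id(
        "MD5E-s3--aaa.dat", Path("ignored"), annex_records, files_latest) == 11
    # no record: fall back to matching the export path in the latest version
    assert resolve_file_id(
        "MD5E-s9--bbb.png", Path("figures/plot.png"),
        annex_records, files_latest) == 12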