Sketch to support private data-proxy dataset

This change merely adds the ability to recognize and process non-public dataset data-proxy URLs. However, it is not enough to support such datasets, because the underlying `fairgraph` query to get a dataset's file listing returns no results. The query is essentially this:

```py
batch = omcore.File.list(
    self.client,
    file_repository=dvr,
    size=chunk_size,
    from_index=cur_index)
```

For the dataset referenced in #58, it returns an empty list with

- a properly authenticated `client`
- `dvr`: `FileRepository(name='buckets/d-07ab1665-73b0-40c5-800e-557bc319109d', iri=IRI(https://data-proxy.ebrains.eu/api/v1/buckets/d-07ab1665-73b0-40c5-800e-557bc319109d)...`
- `chunk_size`: 10000
- `cur_index`: 0

With the same requesting account, I can browser-visit https://data-proxy.ebrains.eu/datasets/07ab1665-73b0-40c5-800e-557bc319109d and see a file listing.
datalad · Jul 14, 2023 · 5fa2e01 · 5fa2e01
1 parent 2c549df
commit 5fa2e01
Showing 1 changed file with 16 additions and 2 deletions.
diff --git a/datalad_ebrains/fairgraph_query.py b/datalad_ebrains/fairgraph_query.py
@@ -186,9 +186,14 @@ def get_file_records(self, ds, kg_dsver):
         # EBRAINS uses different file repositories that need slightly
         # different handling
         dvr_url_p = urlparse(dvr.iri.value)
+        # public data-proxy datasets
         if dvr_url_p.netloc == 'data-proxy.ebrains.eu' \
                 and dvr_url_p.path.startswith('/api/v1/public/buckets/'):
-            get_fname = _get_fname_dataproxy_v1_bucket
+            get_fname = _get_fname_dataproxy_v1_bucket_public
+        # private data-proxy datasets (e.g. human data gateway)
+        elif dvr_url_p.netloc == 'data-proxy.ebrains.eu' \
+                and dvr_url_p.path.startswith('/api/v1/buckets/'):
+            get_fname = _get_fname_dataproxy_v1_bucket_private
         elif dvr_url_p.netloc == 'object.cscs.ch' \
                 and dvr_url_p.query.startswith('prefix='):
             # get the repos base url by removing the query string
@@ -281,7 +286,7 @@ def get_agent_info(self, kg_dsver):
         }
 
 
-def _get_fname_dataproxy_v1_bucket(f):
+def _get_fname_dataproxy_v1_bucket_public(f):
     f_url_p = urlparse(f.iri.value)
     assert f_url_p.netloc == 'data-proxy.ebrains.eu'
     assert f_url_p.path.startswith('/api/v1/public/buckets/')
@@ -290,6 +295,15 @@ def _get_fname_dataproxy_v1_bucket(f):
     return Path(*path.parts[6:])
 
 
+def _get_fname_dataproxy_v1_bucket_private(f):
+    f_url_p = urlparse(f.iri.value)
+    assert f_url_p.netloc == 'data-proxy.ebrains.eu'
+    assert f_url_p.path.startswith('/api/v1/buckets/')
+    path = PurePosixPath(f_url_p.path)
+    # take everything past the bucket_id and turn into a Platform native path
+    return Path(*path.parts[5:])
+
+
 def _get_fname_cscs_repo(baseurl, prefix, f):
     f_url = f.iri.value
     # we presently have no better way to determine a relative file path