Skip to content

Commit

Permalink
Sketch to support private data-proxy dataset
Browse files Browse the repository at this point in the history
This change merely adds the ability to recognize and
process data-proxy URLs of non-public datasets.

However, it is not enough to support such datasets, because
the underlying `fairgraph` query to get a dataset's file listing
returns no results.

The query is essentially this

```py
batch = omcore.File.list(
    self.client,
    file_repository=dvr,
    size=chunk_size,
    from_index=cur_index)
```

and for the dataset referenced in
#58 it returns an empty
list with

- a properly authenticated `client`
- `dvr`: `FileRepository(name='buckets/d-07ab1665-73b0-40c5-800e-557bc319109d', iri=IRI(https://data-proxy.ebrains.eu/api/v1/buckets/d-07ab1665-73b0-40c5-800e-557bc319109d)...`
- `chunk_size`: 10000
- `cur_index`: 0

With the same requesting account, I can visit
https://data-proxy.ebrains.eu/datasets/07ab1665-73b0-40c5-800e-557bc319109d
in a browser and see a file listing.
  • Loading branch information
mih committed Jul 14, 2023
1 parent 2c549df commit 5fa2e01
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions datalad_ebrains/fairgraph_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,14 @@ def get_file_records(self, ds, kg_dsver):
# EBRAINS uses different file repositories that need slightly
# different handling
dvr_url_p = urlparse(dvr.iri.value)
# public data-proxy datasets
if dvr_url_p.netloc == 'data-proxy.ebrains.eu' \
and dvr_url_p.path.startswith('/api/v1/public/buckets/'):
get_fname = _get_fname_dataproxy_v1_bucket
get_fname = _get_fname_dataproxy_v1_bucket_public
# private data-proxy datasets (e.g. human data gateway)
elif dvr_url_p.netloc == 'data-proxy.ebrains.eu' \
and dvr_url_p.path.startswith('/api/v1/buckets/'):
get_fname = _get_fname_dataproxy_v1_bucket_private
elif dvr_url_p.netloc == 'object.cscs.ch' \
and dvr_url_p.query.startswith('prefix='):
# get the repos base url by removing the query string
Expand Down Expand Up @@ -281,7 +286,7 @@ def get_agent_info(self, kg_dsver):
}


def _get_fname_dataproxy_v1_bucket(f):
def _get_fname_dataproxy_v1_bucket_public(f):
f_url_p = urlparse(f.iri.value)
assert f_url_p.netloc == 'data-proxy.ebrains.eu'
assert f_url_p.path.startswith('/api/v1/public/buckets/')
Expand All @@ -290,6 +295,15 @@ def _get_fname_dataproxy_v1_bucket(f):
return Path(*path.parts[6:])


def _get_fname_dataproxy_v1_bucket_private(f):
    """Return the bucket-relative path of a file in a private data-proxy bucket.

    ``f.iri.value`` must be a URL on ``data-proxy.ebrains.eu`` whose path
    starts with ``/api/v1/buckets/``. Everything after the bucket ID
    component is returned as a platform-native ``Path``.
    """
    url = urlparse(f.iri.value)
    assert url.netloc == 'data-proxy.ebrains.eu'
    assert url.path.startswith('/api/v1/buckets/')
    # the leading components are '/', 'api', 'v1', 'buckets', '<bucket_id>';
    # whatever follows is the file path inside the bucket
    n_leading = 5
    components = PurePosixPath(url.path).parts
    return Path(*components[n_leading:])


def _get_fname_cscs_repo(baseurl, prefix, f):
f_url = f.iri.value
# we presently have no better way to determine a relative file path
Expand Down

0 comments on commit 5fa2e01

Please sign in to comment.