Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Search: Returning CachedIrodsPath for data objects #283

Merged
merged 9 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions docs/source/irods_search.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ iRODS Search
============

`iBridges` offers an easy way to search for data. You can pass a combination of path, metadata,
item type and checksum. The output will be a list of :class:`ibridges.path.IrodsPath`, which contain information where to find the item on the iRODS server.
item type and checksum. The output will be a list of :class:`ibridges.path.CachedIrodsPath` objects, which contain information on where to find the item on the iRODS server.

.. note::

Expand All @@ -29,6 +29,11 @@ To find all subcollections and dataobjects in a collection use the `%` as wildca

search_data(session, path_pattern="subcoll/%")

.. note::

The output of a search is a list of :class:`ibridges.path.CachedIrodsPath` objects. Each one contains the information about the data object or collection as it was at the time of the search.
This information is not refetched from the server, i.e. the size of the path will always remain the size it had at the time of the search.


Search data by metadata
-----------------------
Expand Down Expand Up @@ -66,7 +71,7 @@ A query with metadata will look like:
# and one metadata entry that has value=="value", but they do not have to be
# for the same entry as in the above.
search_data(session, metadata=[MetaSearch(key="key"), MetaSearch(value="value")])

Use the `%` as a wildcard again to match any combination of characters.


Expand Down
1 change: 1 addition & 0 deletions ibridges/icat_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# Columns of the data object model (imodels.DataObject) used when building queries.
DATA_PATH = imodels.DataObject.path
DATA_ID = imodels.DataObject.id
DATA_CHECKSUM = imodels.DataObject.checksum
DATA_SIZE = imodels.DataObject.size
# Name/value/units columns of the collection metadata model (imodels.CollectionMeta).
META_COLL_ATTR_NAME = imodels.CollectionMeta.name
META_COLL_ATTR_VALUE = imodels.CollectionMeta.value
META_COLL_ATTR_UNITS = imodels.CollectionMeta.units
Expand Down
4 changes: 4 additions & 0 deletions ibridges/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,10 @@ def checksum(self) -> str:
return super().checksum
return self._checksum

def __repr__(self) -> str:
    """Return ``CachedIrodsPath(...)`` listing the path parts separated by ", "."""
    joined_parts = ", ".join(self._path.parts)
    return f"CachedIrodsPath({joined_parts})"

def dataobject_exists(self) -> bool:
    """Return the stored data-object flag of this cached path.

    The value comes straight from the ``_is_dataobj`` attribute; see
    IrodsPath for the general contract of this method.
    """
    is_data_object = self._is_dataobj
    return is_data_object
Expand Down
73 changes: 43 additions & 30 deletions ibridges/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
from __future__ import annotations

from collections import namedtuple
from typing import Optional, Union
from typing import List, Optional, Union

from ibridges import icat_columns as icat
from ibridges.path import IrodsPath
from ibridges.path import CachedIrodsPath, IrodsPath
from ibridges.session import Session

META_COLS = {
"collection": (icat.META_COLL_ATTR_NAME, icat.META_COLL_ATTR_VALUE, icat.META_COLL_ATTR_UNITS),
"data_object": (icat.META_DATA_ATTR_NAME, icat.META_DATA_ATTR_VALUE, icat.META_DATA_ATTR_UNITS)
"data_object": (icat.META_DATA_ATTR_NAME, icat.META_DATA_ATTR_VALUE, icat.META_DATA_ATTR_UNITS),
}


Expand All @@ -28,8 +28,9 @@ class MetaSearch(namedtuple("MetaSearch", ["key", "value", "units"], defaults=[.
def __new__(cls, key=..., value=..., units=...):
"""Create a new MetaSearch object."""
if key is ... and value is ... and units is ...:
raise ValueError("Cannot create metasearch without specifying either key, value or "
"units.")
raise ValueError(
"Cannot create metasearch without specifying either key, value or units."
)
key = "%" if key is ... else key
value = "%" if value is ... else value
units = "%" if units is ... else units
Expand All @@ -43,8 +44,8 @@ def search_data( # pylint: disable=too-many-branches
checksum: Optional[str] = None,
metadata: Union[None, MetaSearch, list[MetaSearch], list[tuple]] = None,
item_type: Optional[str] = None,
case_sensitive: bool = False
) -> list[IrodsPath]:
case_sensitive: bool = False,
) -> list[CachedIrodsPath]:
"""Search for collections, data objects and metadata.

By default all accessible collections and data objects are returned.
Expand Down Expand Up @@ -83,11 +84,9 @@ def search_data( # pylint: disable=too-many-branches

Returns
-------
List of dictionaries with keys:
COLL_NAME (absolute path of the collection),
DATA_NAME (name of the data object),
D_DATA_CHECKSUM (checksum of the data object)
The latter two keys are only present of the found item is a data object.
List of CachedIrodsPaths.
The CachedIrodsPaths for data objects contain the size and the checksum
found in the search.

Examples
--------
Expand Down Expand Up @@ -156,23 +155,29 @@ def search_data( # pylint: disable=too-many-branches
queries.append((coll_query, "collection"))
if item_type != "collection":
# create the query for data objects; we need the collection name, the data name, the checksum and the size
data_query = session.irods_session.query(icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
case_sensitive=case_sensitive)
data_query = session.irods_session.query(
icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
icat.DATA_SIZE,
case_sensitive=case_sensitive,
)
data_query = data_query.filter(icat.LIKE(icat.COLL_NAME, _postfix_wildcard(path)))
queries.append((data_query, "data_object"))

data_name_query = session.irods_session.query(icat.COLL_NAME, icat.DATA_NAME,
icat.DATA_CHECKSUM,
case_sensitive=case_sensitive)
data_name_query = session.irods_session.query(
icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
icat.DATA_SIZE,
case_sensitive=case_sensitive,
)
data_name_query.filter(icat.LIKE(icat.COLL_NAME, f"{path}"))
queries.append((data_name_query, "data_object"))

if path_pattern is not None:
_path_filter(path_pattern, queries)


for mf in metadata:
_meta_filter(mf, queries)

Expand All @@ -184,23 +189,28 @@ def search_data( # pylint: disable=too-many-branches
query_results.extend(list(q[0]))

# gather results, data_query and data_name_query can contain the same results
results = [
dict(s) for s in set(frozenset(d.items())
for d in query_results)
]
results = [dict(s) for s in set(frozenset(d.items()) for d in query_results)]
for item in results:
if isinstance(item, dict):
new_keys = [k.icat_key for k in item.keys()]
for n_key, o_key in zip(new_keys, item.keys()):
item[n_key] = item.pop(o_key)
for meta_key in list(item.keys()):
item[meta_key.icat_key] = item.pop(meta_key)

# Convert the results to CachedIrodsPath objects.
ipath_results = []
ipath_results: List[CachedIrodsPath] = []
for res in results:
if "DATA_NAME" in res:
ipath_results.append(IrodsPath(session, res["COLL_NAME"], res["DATA_NAME"]))
ipath_results.append(
CachedIrodsPath(
session,
res["DATA_SIZE"],
True,
res["D_DATA_CHECKSUM"],
res["COLL_NAME"],
res["DATA_NAME"],
)
)
else:
ipath_results.append(IrodsPath(session, res["COLL_NAME"]))
ipath_results.append(CachedIrodsPath(session, None, False, None, res["COLL_NAME"]))
return ipath_results


Expand All @@ -209,11 +219,13 @@ def _prefix_wildcard(pattern):
return pattern
return f"%/{pattern}"


def _postfix_wildcard(path):
    """Append a "%" wildcard to ``path``, separated from it by exactly one "/".

    No extra "/" is inserted when the string form of ``path`` already ends
    with one.
    """
    separator = "" if str(path).endswith("/") else "/"
    return f"{path}{separator}%"


def _path_filter(path_pattern, queries):
for q, q_type in queries:
if q_type == "collection":
Expand All @@ -233,6 +245,7 @@ def _meta_filter(metadata, queries):
for i_elem, elem in enumerate(MetaSearch(*metadata)):
q.filter(icat.LIKE(META_COLS[q_type][i_elem], elem))


def _checksum_filter(checksum, queries):
for q, q_type in queries:
if q_type == "data_object":
Expand Down
Loading