Skip to content

Commit

Permalink
Search: Returning CachedIrodsPath for data objects (#283)
Browse files Browse the repository at this point in the history
* Returning CachedIrodsPath for data objects

* adjust doc string

* making mypy happy

* making ruff happy

* Use only CachedIrodsPaths

* Add own representation string to CachedIrodsPath for clarity

* update docs

* Update ibridges/search.py

Co-authored-by: qubixes <[email protected]>

* fix little bug

---------

Co-authored-by: Staiger, Christine <[email protected]>
Co-authored-by: qubixes <[email protected]>
  • Loading branch information
3 people authored Nov 15, 2024
1 parent 1a8a681 commit 02e982a
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 32 deletions.
9 changes: 7 additions & 2 deletions docs/source/irods_search.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ iRODS Search
============

`iBridges` offers an easy way to search for data. You can pass a combination of path, metadata,
item type and checksum. The output will be a list of :class:`ibridges.path.IrodsPath`, which contain information where to find the item on the iRODS server.
item type and checksum. The output will be a list of :class:`ibridges.path.CachedIrodsPath` objects, each of which contains the information needed to locate the item on the iRODS server.

.. note::

Expand All @@ -29,6 +29,11 @@ To find all subcollections and dataobjects in a collection use the `%` as wildca
search_data(session, path_pattern="subcoll/%")
.. note::

    The output of a search is a :class:`ibridges.path.CachedIrodsPath`. It contains the information about the data object or collection at the time of the search.
    This information is not refetched from the server; for example, the size reported for a data object will always remain the size it had at the time of the search.


Search data by metadata
-----------------------
Expand Down Expand Up @@ -66,7 +71,7 @@ A query with metadata will look like:
# and one metadata entry that has value=="value", but they do not have to be
# for the same entry as in the above.
search_data(session, metadata=[MetaSearch(key="key"), MetaSearch(value="value")])
Use the `%` as a wildcard again to match any combination of characters.


Expand Down
1 change: 1 addition & 0 deletions ibridges/icat_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
DATA_PATH = imodels.DataObject.path
DATA_ID = imodels.DataObject.id
DATA_CHECKSUM = imodels.DataObject.checksum
DATA_SIZE = imodels.DataObject.size
META_COLL_ATTR_NAME = imodels.CollectionMeta.name
META_COLL_ATTR_VALUE = imodels.CollectionMeta.value
META_COLL_ATTR_UNITS = imodels.CollectionMeta.units
Expand Down
4 changes: 4 additions & 0 deletions ibridges/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,10 @@ def checksum(self) -> str:
return super().checksum
return self._checksum

def __repr__(self) -> str:
"""Representation of the CachedIrodsPath object in line with a Path object."""
return f"CachedIrodsPath({', '.join(self._path.parts)})"

def dataobject_exists(self) -> bool:
"""See IrodsPath."""
return self._is_dataobj
Expand Down
73 changes: 43 additions & 30 deletions ibridges/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
from __future__ import annotations

from collections import namedtuple
from typing import Optional, Union
from typing import List, Optional, Union

from ibridges import icat_columns as icat
from ibridges.path import IrodsPath
from ibridges.path import CachedIrodsPath, IrodsPath
from ibridges.session import Session

META_COLS = {
"collection": (icat.META_COLL_ATTR_NAME, icat.META_COLL_ATTR_VALUE, icat.META_COLL_ATTR_UNITS),
"data_object": (icat.META_DATA_ATTR_NAME, icat.META_DATA_ATTR_VALUE, icat.META_DATA_ATTR_UNITS)
"data_object": (icat.META_DATA_ATTR_NAME, icat.META_DATA_ATTR_VALUE, icat.META_DATA_ATTR_UNITS),
}


Expand All @@ -28,8 +28,9 @@ class MetaSearch(namedtuple("MetaSearch", ["key", "value", "units"], defaults=[.
def __new__(cls, key=..., value=..., units=...):
"""Create a new MetaSearch object."""
if key is ... and value is ... and units is ...:
raise ValueError("Cannot create metasearch without specifying either key, value or "
"units.")
raise ValueError(
"Cannot create metasearch without specifying either key, value or units."
)
key = "%" if key is ... else key
value = "%" if value is ... else value
units = "%" if units is ... else units
Expand All @@ -43,8 +44,8 @@ def search_data( # pylint: disable=too-many-branches
checksum: Optional[str] = None,
metadata: Union[None, MetaSearch, list[MetaSearch], list[tuple]] = None,
item_type: Optional[str] = None,
case_sensitive: bool = False
) -> list[IrodsPath]:
case_sensitive: bool = False,
) -> list[CachedIrodsPath]:
"""Search for collections, data objects and metadata.
By default all accessible collections and data objects are returned.
Expand Down Expand Up @@ -83,11 +84,9 @@ def search_data( # pylint: disable=too-many-branches
Returns
-------
List of dictionaries with keys:
COLL_NAME (absolute path of the collection),
DATA_NAME (name of the data object),
D_DATA_CHECKSUM (checksum of the data object)
The latter two keys are only present of the found item is a data object.
List of CachedIrodsPaths.
The CachedIrodsPaths for data objects contain the size and the checksum
found in the search.
Examples
--------
Expand Down Expand Up @@ -156,23 +155,29 @@ def search_data( # pylint: disable=too-many-branches
queries.append((coll_query, "collection"))
if item_type != "collection":
# create the query for data objects; we need the collection name, the data name and checksum
data_query = session.irods_session.query(icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
case_sensitive=case_sensitive)
data_query = session.irods_session.query(
icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
icat.DATA_SIZE,
case_sensitive=case_sensitive,
)
data_query = data_query.filter(icat.LIKE(icat.COLL_NAME, _postfix_wildcard(path)))
queries.append((data_query, "data_object"))

data_name_query = session.irods_session.query(icat.COLL_NAME, icat.DATA_NAME,
icat.DATA_CHECKSUM,
case_sensitive=case_sensitive)
data_name_query = session.irods_session.query(
icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
icat.DATA_SIZE,
case_sensitive=case_sensitive,
)
data_name_query.filter(icat.LIKE(icat.COLL_NAME, f"{path}"))
queries.append((data_name_query, "data_object"))

if path_pattern is not None:
_path_filter(path_pattern, queries)


for mf in metadata:
_meta_filter(mf, queries)

Expand All @@ -184,23 +189,28 @@ def search_data( # pylint: disable=too-many-branches
query_results.extend(list(q[0]))

# gather results, data_query and data_name_query can contain the same results
results = [
dict(s) for s in set(frozenset(d.items())
for d in query_results)
]
results = [dict(s) for s in set(frozenset(d.items()) for d in query_results)]
for item in results:
if isinstance(item, dict):
new_keys = [k.icat_key for k in item.keys()]
for n_key, o_key in zip(new_keys, item.keys()):
item[n_key] = item.pop(o_key)
for meta_key in list(item.keys()):
item[meta_key.icat_key] = item.pop(meta_key)

# Convert the results to IrodsPath objects.
ipath_results = []
ipath_results: List[CachedIrodsPath] = []
for res in results:
if "DATA_NAME" in res:
ipath_results.append(IrodsPath(session, res["COLL_NAME"], res["DATA_NAME"]))
ipath_results.append(
CachedIrodsPath(
session,
res["DATA_SIZE"],
True,
res["D_DATA_CHECKSUM"],
res["COLL_NAME"],
res["DATA_NAME"],
)
)
else:
ipath_results.append(IrodsPath(session, res["COLL_NAME"]))
ipath_results.append(CachedIrodsPath(session, None, False, None, res["COLL_NAME"]))
return ipath_results


Expand All @@ -209,11 +219,13 @@ def _prefix_wildcard(pattern):
return pattern
return f"%/{pattern}"


def _postfix_wildcard(path):
if str(path).endswith("/"):
return f"{path}%"
return f"{path}/%"


def _path_filter(path_pattern, queries):
for q, q_type in queries:
if q_type == "collection":
Expand All @@ -233,6 +245,7 @@ def _meta_filter(metadata, queries):
for i_elem, elem in enumerate(MetaSearch(*metadata)):
q.filter(icat.LIKE(META_COLS[q_type][i_elem], elem))


def _checksum_filter(checksum, queries):
for q, q_type in queries:
if q_type == "data_object":
Expand Down

0 comments on commit 02e982a

Please sign in to comment.