Skip to content

Commit

Permalink
Search: Returning CachedIrodsPath for data objects (#283)
Browse files Browse the repository at this point in the history
* Returning CachedIrodsPath for data objects

* adjust doc string

* making mypy happy

* making ruff happy

* Use only CachedIrodsPaths

* Add own representation string to CachedIrodsPath for clarity

* update docs

* Update ibridges/search.py

Co-authored-by: qubixes <[email protected]>

* fix little bug

---------

Co-authored-by: Staiger, Christine <[email protected]>
Co-authored-by: qubixes <[email protected]>
  • Loading branch information
3 people authored Nov 15, 2024
1 parent 1a8a681 commit 02e982a
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 32 deletions.
9 changes: 7 additions & 2 deletions docs/source/irods_search.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ iRODS Search
============

`iBridges` offers an easy way to search for data. You can pass a combination of path, metadata,
item type and checksum. The output will be a list of :class:`ibridges.path.IrodsPath`, which contain information where to find the item on the iRODS server.
item type and checksum. The output will be a list of :class:`ibridges.path.CachedIrodsPath` objects, each of which contains the information needed to locate the item on the iRODS server.

.. note::

Expand All @@ -29,6 +29,11 @@ To find all subcollections and dataobjects in a collection use the `%` as wildca
search_data(session, path_pattern="subcoll/%")
.. note::

    The output of a search is a :class:`ibridges.path.CachedIrodsPath`. It contains the information about the data object or collection at the time of the search.
    This information is not refetched from the server; for example, the size reported for a data object will always remain the size it had at the time of the search.


Search data by metadata
-----------------------
Expand Down Expand Up @@ -66,7 +71,7 @@ A query with metadata will look like:
# and one metadata entry that has value=="value", but they do not have to be
# for the same entry as in the above.
search_data(session, metadata=[MetaSearch(key="key"), MetaSearch(value="value")])
Use the `%` as a wildcard again to match any combination of characters.


Expand Down
1 change: 1 addition & 0 deletions ibridges/icat_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
DATA_PATH = imodels.DataObject.path
DATA_ID = imodels.DataObject.id
DATA_CHECKSUM = imodels.DataObject.checksum
DATA_SIZE = imodels.DataObject.size
META_COLL_ATTR_NAME = imodels.CollectionMeta.name
META_COLL_ATTR_VALUE = imodels.CollectionMeta.value
META_COLL_ATTR_UNITS = imodels.CollectionMeta.units
Expand Down
4 changes: 4 additions & 0 deletions ibridges/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,10 @@ def checksum(self) -> str:
return super().checksum
return self._checksum

def __repr__(self) -> str:
"""Representation of the CachedIrodsPath object in line with a Path object."""
return f"CachedIrodsPath({', '.join(self._path.parts)})"

def dataobject_exists(self) -> bool:
"""See IrodsPath."""
return self._is_dataobj
Expand Down
73 changes: 43 additions & 30 deletions ibridges/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
from __future__ import annotations

from collections import namedtuple
from typing import Optional, Union
from typing import List, Optional, Union

from ibridges import icat_columns as icat
from ibridges.path import IrodsPath
from ibridges.path import CachedIrodsPath, IrodsPath
from ibridges.session import Session

META_COLS = {
"collection": (icat.META_COLL_ATTR_NAME, icat.META_COLL_ATTR_VALUE, icat.META_COLL_ATTR_UNITS),
"data_object": (icat.META_DATA_ATTR_NAME, icat.META_DATA_ATTR_VALUE, icat.META_DATA_ATTR_UNITS)
"data_object": (icat.META_DATA_ATTR_NAME, icat.META_DATA_ATTR_VALUE, icat.META_DATA_ATTR_UNITS),
}


Expand All @@ -28,8 +28,9 @@ class MetaSearch(namedtuple("MetaSearch", ["key", "value", "units"], defaults=[.
def __new__(cls, key=..., value=..., units=...):
"""Create a new MetaSearch object."""
if key is ... and value is ... and units is ...:
raise ValueError("Cannot create metasearch without specifying either key, value or "
"units.")
raise ValueError(
"Cannot create metasearch without specifying either key, value or units."
)
key = "%" if key is ... else key
value = "%" if value is ... else value
units = "%" if units is ... else units
Expand All @@ -43,8 +44,8 @@ def search_data( # pylint: disable=too-many-branches
checksum: Optional[str] = None,
metadata: Union[None, MetaSearch, list[MetaSearch], list[tuple]] = None,
item_type: Optional[str] = None,
case_sensitive: bool = False
) -> list[IrodsPath]:
case_sensitive: bool = False,
) -> list[CachedIrodsPath]:
"""Search for collections, data objects and metadata.
By default all accessible collections and data objects are returned.
Expand Down Expand Up @@ -83,11 +84,9 @@ def search_data( # pylint: disable=too-many-branches
Returns
-------
List of dictionaries with keys:
COLL_NAME (absolute path of the collection),
DATA_NAME (name of the data object),
D_DATA_CHECKSUM (checksum of the data object)
The latter two keys are only present of the found item is a data object.
List of CachedIrodsPaths.
The CachedIrodsPaths for data objects contain the size and the checksum
found in the search.
Examples
--------
Expand Down Expand Up @@ -156,23 +155,29 @@ def search_data( # pylint: disable=too-many-branches
queries.append((coll_query, "collection"))
if item_type != "collection":
# create the query for data objects; we need the collection name, the data name and checksum
data_query = session.irods_session.query(icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
case_sensitive=case_sensitive)
data_query = session.irods_session.query(
icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
icat.DATA_SIZE,
case_sensitive=case_sensitive,
)
data_query = data_query.filter(icat.LIKE(icat.COLL_NAME, _postfix_wildcard(path)))
queries.append((data_query, "data_object"))

data_name_query = session.irods_session.query(icat.COLL_NAME, icat.DATA_NAME,
icat.DATA_CHECKSUM,
case_sensitive=case_sensitive)
data_name_query = session.irods_session.query(
icat.COLL_NAME,
icat.DATA_NAME,
icat.DATA_CHECKSUM,
icat.DATA_SIZE,
case_sensitive=case_sensitive,
)
data_name_query.filter(icat.LIKE(icat.COLL_NAME, f"{path}"))
queries.append((data_name_query, "data_object"))

if path_pattern is not None:
_path_filter(path_pattern, queries)


for mf in metadata:
_meta_filter(mf, queries)

Expand All @@ -184,23 +189,28 @@ def search_data( # pylint: disable=too-many-branches
query_results.extend(list(q[0]))

# gather results, data_query and data_name_query can contain the same results
results = [
dict(s) for s in set(frozenset(d.items())
for d in query_results)
]
results = [dict(s) for s in set(frozenset(d.items()) for d in query_results)]
for item in results:
if isinstance(item, dict):
new_keys = [k.icat_key for k in item.keys()]
for n_key, o_key in zip(new_keys, item.keys()):
item[n_key] = item.pop(o_key)
for meta_key in list(item.keys()):
item[meta_key.icat_key] = item.pop(meta_key)

# Convert the results to IrodsPath objects.
ipath_results = []
ipath_results: List[CachedIrodsPath] = []
for res in results:
if "DATA_NAME" in res:
ipath_results.append(IrodsPath(session, res["COLL_NAME"], res["DATA_NAME"]))
ipath_results.append(
CachedIrodsPath(
session,
res["DATA_SIZE"],
True,
res["D_DATA_CHECKSUM"],
res["COLL_NAME"],
res["DATA_NAME"],
)
)
else:
ipath_results.append(IrodsPath(session, res["COLL_NAME"]))
ipath_results.append(CachedIrodsPath(session, None, False, None, res["COLL_NAME"]))
return ipath_results


Expand All @@ -209,11 +219,13 @@ def _prefix_wildcard(pattern):
return pattern
return f"%/{pattern}"


def _postfix_wildcard(path):
if str(path).endswith("/"):
return f"{path}%"
return f"{path}/%"


def _path_filter(path_pattern, queries):
for q, q_type in queries:
if q_type == "collection":
Expand All @@ -233,6 +245,7 @@ def _meta_filter(metadata, queries):
for i_elem, elem in enumerate(MetaSearch(*metadata)):
q.filter(icat.LIKE(META_COLS[q_type][i_elem], elem))


def _checksum_filter(checksum, queries):
for q, q_type in queries:
if q_type == "data_object":
Expand Down

0 comments on commit 02e982a

Please sign in to comment.