Merge pull request #8 from glencoesoftware/queries

Add query support
glencoesoftware · Nov 17, 2023 · f99cc8e · f99cc8e
2 parents a2158f5 + 29c9081
commit f99cc8e
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -165,3 +165,36 @@ with omero2pandas.OMEROConnection(server='my.server', port=4064,
 ```
 
 The context manager will handle session creation and cleanup automatically.
+
+### Querying tables
+
+You can also supply [PyTables condition syntax](https://www.pytables.org/usersguide/condition_syntax.html) to the `read_table` and `download_table` functions.
+Returned tables will only include rows which pass this filter.
+
+**Basic syntax**
+Select rows representing objects with area greater than 20:
+```python
+omero2pandas.read_table(file_id=10, query='(area>20)')
+```
+
+**Multiple conditions**
+
+Select rows representing objects with an even ID number lower than 50:
+```python
+omero2pandas.read_table(file_id=10, query='(id%2==0) & (id<50)')
+```
+
+**Complex conditions** 
+
+Select rows representing objects which originated from an ROI named 'Nucleus':
+```python
+omero2pandas.read_table(file_id=10, query='x=="Nucleus"', variables={'x': omero.rtypes.rstring('Roi Name')})
+```
+
+N.B. Column names containing spaces aren't supported by the native syntax, but can be referenced through variables supplied via the `variables` parameter.
+
+The variables map needs to be a dictionary mapping string variables to [OMERO rtypes](https://omero.readthedocs.io/en/v5.6.9/developers/GettingStarted/AdvancedClientDevelopment.html#rtypes) objects rather than raw Python objects. 
+These should match the relevant column type. Mapped variables are substituted into the query during processing.
+
+A `variables` map usually isn't needed for simple queries. The basic condition string should automatically get converted to a meaningful type, but when this fails,
+replacing tricky elements with a variable may help.
diff --git a/omero2pandas/__init__.py b/omero2pandas/__init__.py
@@ -89,7 +89,7 @@ def get_table_columns(file_id=None, annotation_id=None,
 
 def read_table(file_id=None, annotation_id=None, column_names=(), rows=None,
                chunk_size=1000, omero_connector=None, server=None, port=4064,
-               username=None, password=None):
+               username=None, password=None, query=None, variables=None):
     """
     Gets table data from the server.
     Supply either a file or annotation ID.
@@ -110,8 +110,16 @@ def read_table(file_id=None, annotation_id=None, column_names=(), rows=None,
     Default None = load all rows.
     :param column_names: Iterable of column name strings to load.
     Default None = load all columns.
+    :param query: String containing the PyTables query which would return a
+    subset of rows from the table. Only rows which pass this query will be
+    returned. Cannot be used with the 'rows' parameter.
+    :param variables: Dictionary containing variables to map onto the query
+    string.
     :return: pandas.DataFrame object containing requested data
     """
+    if rows is not None and query is not None:
+        raise ValueError("Running a query supersedes the rows argument. "
+                         "Please only supply one.")
     object_id, object_type = _validate_requested_object(
         file_id=file_id, annotation_id=annotation_id)
 
@@ -134,7 +142,14 @@ def read_table(file_id=None, annotation_id=None, column_names=(), rows=None,
         else:
             target_cols = range(len(heads))
         # Determine requested rows
-        if rows is None:
+        if query is not None:
+            if variables is None:
+                variables = {}
+            rows = data_table.getWhereList(condition=query,
+                                           variables=variables,
+                                           start=0, stop=-1, step=1)
+            num_rows = len(rows)
+        elif rows is None:
             num_rows = data_table.getNumberOfRows()
         else:
             rows = list(rows)
@@ -203,7 +218,7 @@ def upload_table(dataframe, table_name, parent_id, parent_type='Image',
 def download_table(target_path, file_id=None, annotation_id=None,
                    column_names=(), rows=None, chunk_size=1000,
                    omero_connector=None, server=None, port=4064,
-                   username=None, password=None):
+                   username=None, password=None, query=None, variables=None):
     """
     Downloads table data into a CSV file.
     Supply either a file or annotation ID.
@@ -225,8 +240,16 @@ def download_table(target_path, file_id=None, annotation_id=None,
     Default None = load all rows.
     :param column_names: Iterable of column name strings to load.
     Default None = load all columns.
+    :param query: String containing the PyTables query which would return a
+    subset of rows from the table. Only rows which pass this query will be
+    returned. Cannot be used with the 'rows' parameter.
+    :param variables: Dictionary containing variables to map onto the query
+    string.
     :return: pandas.DataFrame object containing requested data
     """
+    if rows is not None and query is not None:
+        raise ValueError("Running a query supersedes the rows argument. "
+                         "Please only supply one.")
     object_id, object_type = _validate_requested_object(
         file_id=file_id, annotation_id=annotation_id)
 
@@ -253,7 +276,14 @@ def download_table(target_path, file_id=None, annotation_id=None,
         else:
             target_cols = range(len(heads))
         # Determine requested rows
-        if rows is None:
+        if query is not None:
+            if variables is None:
+                variables = {}
+            rows = data_table.getWhereList(condition=query,
+                                           variables=variables,
+                                           start=0, stop=-1, step=1)
+            num_rows = len(rows)
+        elif rows is None:
             num_rows = data_table.getNumberOfRows()
         else:
             rows = list(rows)