Added new configuration option for additional_df_cols. (#54)

esgf2-us · May 1, 2024 · d30262e · d30262e
1 parent 3961835
commit d30262e
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 20 deletions.
diff --git a/doc/configure.md b/doc/configure.md
@@ -25,6 +25,7 @@ format. You will notice a few keywords:
 - `download_db` - the full path to the SQL database with download records.
 - `globus_indices` - a dictionary whose keys are the Globus index name and values map to a boolean indicating that the index is enabled.
 - `solr_indices` - a dictionary whose keys are the Solr index base url and values map to a boolean indicating that the index is enabled.
+- `additional_df_cols` - a list of additional columns to include in the search results DataFrame. Columns that are not part of the search response will be ignored. Defaults to `["datetime_start", "datetime_stop"]`.
 
 ## Indices
 

diff --git a/intake_esgf/base.py b/intake_esgf/base.py
@@ -328,10 +328,15 @@ def get_dataframe_columns(content: dict[str, Any]) -> list[str]:
     content
         The content (Globus) or document (Solr) returned from the query.
     """
+    # Required columns
+    req = ["version", "data_node"]
 
-    # CMIP5 is a disaster so...
+    # Additional columns from the configuration
+    extra = intake_esgf.conf.get("additional_df_cols", [])
+
+    # Project dependent columns
     if "project" in content and content["project"][0] == "CMIP5":
-        return [
+        proj = [
             "institute",
             "model",
             "experiment",
@@ -340,17 +345,17 @@ def get_dataframe_columns(content: dict[str, Any]) -> list[str]:
             "cmor_table",
             "ensemble",
             "variable",
-            "version",
-            "data_node",
         ]
     # ...everything else (so far) behaves nicely so...
-    if "dataset_id_template_" not in content:
+    elif "dataset_id_template_" not in content:
         raise ValueError(f"No `dataset_id_template_` in {content[id]}")
-    columns = re.findall(
-        r"%\((\w+)\)s",
-        content["dataset_id_template_"][0],
-    )
-    columns = list(set(columns).union(["version", "data_node"]))
+    else:
+        proj = re.findall(
+            r"%\((\w+)\)s",
+            content["dataset_id_template_"][0],
+        )
+
+    columns = list(set(proj).union(req + extra))
     return columns
 
 

diff --git a/intake_esgf/config.py b/intake_esgf/config.py
@@ -22,6 +22,7 @@
         "esgf.nci.org.au": False,
         "esgf-node.ornl.gov": False,
     },
+    "additional_df_cols": ["datetime_start", "datetime_stop"],
     "esg_dataroot": [
         "/p/css03/esgf_publish",
         "/eagle/projects/ESGF2/esg_dataroot",
@@ -76,21 +77,25 @@ def set(
         indices: dict[str, bool] = {},
         all_indices: bool = False,
         esg_dataroot: Union[list[str], None] = None,
-        local_cache: Union[list[str], None] = None
+        local_cache: Union[list[str], None] = None,
+        additional_df_cols: Union[list[str], None] = None
     ):
         """Change intake-esgf configuration options.
 
         Parameters
         ----------
-        indices
-            A dictionary of indices whose use status you wish to change.
-        all_indices
+        indices: dict
+            Indices whose use status you wish to change.
+        all_indices: bool
             Enable to check all indices for dataset information.
-        esg_dataroot
-            A list of read-only locations that we will check for ESGF data.
-        local_cache
-            A list of locations where we read and write data to, prefering the first
+        esg_dataroot: list
+            Read-only locations that we will check for ESGF data.
+        local_cache: list
+            Locations where we read and write data to, preferring the first
             entry.
+        additional_df_cols: list
+            Additional columns to include in the dataframe. Must be part
+            of the search results.
 
         Examples
         --------
@@ -125,6 +130,13 @@ def set(
             self["local_cache"] = (
                 local_cache if isinstance(local_cache, list) else [local_cache]
             )
+        if additional_df_cols is not None:
+            self["additional_df_cols"] = (
+                additional_df_cols
+                if isinstance(additional_df_cols, list)
+                else [additional_df_cols]
+            )
+
         return self._unset(temp)
 
     def __getitem__(self, item):

diff --git a/intake_esgf/tests/test_basic.py b/intake_esgf/tests/test_basic.py
@@ -6,14 +6,18 @@
 
 
 def test_search():
-    with intake_esgf.conf.set(indices={SOLR_TEST: True}):
+    extra = ["datetime_start", "datetime_stop"]
+    with intake_esgf.conf.set(indices={SOLR_TEST: True}, additional_df_cols=extra):
         cat = ESGFCatalog().search(
             experiment_id="historical",
             source_id="CanESM5",
             variable_id=["gpp"],
             variant_label=["r1i1p1f1"],
         )
-        print(cat)
+
+        # Check that user-configured columns are in the dataframe
+        assert all([col in cat.df.columns for col in extra])
+
         ds = cat.to_dataset_dict()
         assert "gpp" in ds
         assert "sftlf" in ds["gpp"]

diff --git a/intake_esgf/tests/test_solr.py b/intake_esgf/tests/test_solr.py
@@ -27,6 +27,7 @@ def test_search():
     assert len(df) > 0
 
 
+
 def test_tracking_ids():
     df = index.from_tracking_ids(["hdl:21.14100/872062df-acae-499b-aa0f-9eaca7681abc"])
     assert len(df) > 0