Skip to content

Commit

Permalink
Added new configuration option for additional_df_cols. (#54)
Browse files Browse the repository at this point in the history
  • Loading branch information
huard authored May 1, 2024
1 parent 3961835 commit d30262e
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 20 deletions.
1 change: 1 addition & 0 deletions doc/configure.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ format. You will notice a few keywords:
- `download_db` - the full path to the SQL database with download records.
- `globus_indices` - a dictionary whose keys are the Globus index name and values map to a boolean indicating that the index is enabled.
- `solr_indices` - a dictionary whose keys are the Solr index base url and values map to a boolean indicating that the index is enabled.
- `additional_df_cols` - a list of additional columns to include in the search results DataFrame. Columns that are not part of the search response will be ignored. Defaults to `["datetime_start", "datetime_stop"]`.

## Indices

Expand Down
25 changes: 15 additions & 10 deletions intake_esgf/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,10 +328,15 @@ def get_dataframe_columns(content: dict[str, Any]) -> list[str]:
content
The content (Globus) or document (Solr) returned from the query.
"""
# Required columns
req = ["version", "data_node"]

# CMIP5 is a disaster so...
# Additional columns from the configuration
extra = intake_esgf.conf.get("additional_df_cols", [])

# Project dependent columns
if "project" in content and content["project"][0] == "CMIP5":
return [
proj = [
"institute",
"model",
"experiment",
Expand All @@ -340,17 +345,17 @@ def get_dataframe_columns(content: dict[str, Any]) -> list[str]:
"cmor_table",
"ensemble",
"variable",
"version",
"data_node",
]
# ...everything else (so far) behaves nicely so...
if "dataset_id_template_" not in content:
elif "dataset_id_template_" not in content:
raise ValueError(f"No `dataset_id_template_` in {content[id]}")
columns = re.findall(
r"%\((\w+)\)s",
content["dataset_id_template_"][0],
)
columns = list(set(columns).union(["version", "data_node"]))
else:
proj = re.findall(
r"%\((\w+)\)s",
content["dataset_id_template_"][0],
)

columns = list(set(proj).union(req + extra))
return columns


Expand Down
28 changes: 20 additions & 8 deletions intake_esgf/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"esgf.nci.org.au": False,
"esgf-node.ornl.gov": False,
},
"additional_df_cols": ["datetime_start", "datetime_stop"],
"esg_dataroot": [
"/p/css03/esgf_publish",
"/eagle/projects/ESGF2/esg_dataroot",
Expand Down Expand Up @@ -76,21 +77,25 @@ def set(
indices: dict[str, bool] = {},
all_indices: bool = False,
esg_dataroot: Union[list[str], None] = None,
local_cache: Union[list[str], None] = None
local_cache: Union[list[str], None] = None,
additional_df_cols: Union[list[str], None] = None
):
"""Change intake-esgf configuration options.
Parameters
----------
indices
A dictionary of indices whose use status you wish to change.
all_indices
indices: dict
Indices whose use status you wish to change.
all_indices: bool
Enable to check all indices for dataset information.
esg_dataroot
A list of read-only locations that we will check for ESGF data.
local_cache
A list of locations where we read and write data to, prefering the first
esg_dataroot: list
Read-only locations that we will check for ESGF data.
local_cache: list
Locations where we read and write data, preferring the first
entry.
additional_df_cols: list
Additional columns to include in the dataframe. Columns that are
not part of the search results are ignored.
Examples
--------
Expand Down Expand Up @@ -125,6 +130,13 @@ def set(
self["local_cache"] = (
local_cache if isinstance(local_cache, list) else [local_cache]
)
if additional_df_cols is not None:
self["additional_df_cols"] = (
additional_df_cols
if isinstance(additional_df_cols, list)
else [additional_df_cols]
)

return self._unset(temp)

def __getitem__(self, item):
Expand Down
8 changes: 6 additions & 2 deletions intake_esgf/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@


def test_search():
with intake_esgf.conf.set(indices={SOLR_TEST: True}):
extra = ["datetime_start", "datetime_stop"]
with intake_esgf.conf.set(indices={SOLR_TEST: True}, additional_df_cols=extra):
cat = ESGFCatalog().search(
experiment_id="historical",
source_id="CanESM5",
variable_id=["gpp"],
variant_label=["r1i1p1f1"],
)
print(cat)

# Check that user-configured columns are in the dataframe
assert all([col in cat.df.columns for col in extra])

ds = cat.to_dataset_dict()
assert "gpp" in ds
assert "sftlf" in ds["gpp"]
Expand Down
1 change: 1 addition & 0 deletions intake_esgf/tests/test_solr.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def test_search():
assert len(df) > 0



def test_tracking_ids():
df = index.from_tracking_ids(["hdl:21.14100/872062df-acae-499b-aa0f-9eaca7681abc"])
assert len(df) > 0
Expand Down

0 comments on commit d30262e

Please sign in to comment.