Skip to content

Commit

Permalink
FIX: Improve test coverage and runtime (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
nocollier authored Jan 25, 2024
1 parent 5ea9997 commit 555eef3
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 96 deletions.
57 changes: 0 additions & 57 deletions intake_esgf/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,66 +182,9 @@ def parallel_download(
return None, None


def get_relative_esgf_path(entry: dict[str, Any]) -> Path:
    """Return the relative ESGF path from the Globus entry.

    Parameters
    ----------
    entry
        A Globus search result entry; must contain a ``content`` mapping with
        at least the ``version``, ``dataset_id`` and
        ``directory_format_template_`` keys.

    Returns
    -------
    Path
        The dataset path relative to the ESGF data root.

    Raises
    ------
    ValueError
        If ``entry`` has no ``content`` or the content lacks required keys.
    """
    if "content" not in entry:
        raise ValueError("'content' not part of the entry.")
    # Work on a shallow copy so the caller's entry is not mutated when we
    # rewrite `version` below.
    content = dict(entry["content"])
    if {"version", "dataset_id", "directory_format_template_"}.difference(
        content.keys()
    ):
        raise ValueError("Entry content does not contain expected keys.")
    # For some reason, the `version` in the globus response is just an integer and
    # not what is used in the file path so I have to parse it out of `dataset_id`.
    content["version"] = [content["dataset_id"].split("|")[0].split(".")[-1]]
    # Format the file path using the template in the response: each `%(key)s`
    # placeholder becomes `{key[0]}` because the content values are lists.
    template = content["directory_format_template_"][0]
    file_path = Path(
        template.replace("%(root)s/", "")
        .replace("%(", "{")
        .replace(")s", "[0]}")
        .format(**content)
    )
    return file_path


def combine_file_info(indices, dataset_ids: list[str]) -> list[dict[str, Any]]:
    """Combine file information for the given datasets from all indices.

    List-valued entries (such as download locations) found under the same file
    ``path`` are concatenated; scalar entries keep their first-seen value.

    Parameters
    ----------
    indices
        A list of index classes, see `intake_esgf.core`.
    dataset_ids
        The dataset_ids for which we are seeking file information.

    Returns
    -------
    list[dict[str, Any]]
        One merged info record per unique file path.
    """
    # NOTE: the original return annotation said `dict[str, Any]` but the
    # function has always returned a list of records; annotation corrected.
    merged_info: dict[str, dict[str, Any]] = {}
    for ind in indices:
        try:
            infos = ind.get_file_info(dataset_ids)
        except requests.exceptions.RequestException:
            # An unreachable index should not abort the merge; just skip it.
            continue
        # loop thru all the infos and uniquely add by path
        for info in infos:
            merged = merged_info.setdefault(info["path"], {})
            for key, val in info.items():
                if key not in merged:
                    merged[key] = val
                elif isinstance(val, list):
                    # Accumulate alternatives from other indices.
                    merged[key] += val
    return list(merged_info.values())


def check_for_esgf_dataroot() -> Union[Path, None]:
"""Return a direct path to the ESGF data is it exists."""
to_check = [
"/gpfs/alpine/cli137/proj-shared/ESGF/esg_dataroot/css03_data/", # OLCF
"/p/css03/esgf_publish", # Nimbus
"/eagle/projects/ESGF2/esg_dataroot", # ALCF
"/global/cfs/projectdirs/m3522/cmip6/", # NERSC data lake
Expand Down
79 changes: 61 additions & 18 deletions intake_esgf/tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,44 @@
from intake_esgf import ESGFCatalog
from intake_esgf.exceptions import NoSearchResults


def test_search():
cat = ESGFCatalog().search(
activity_id="CMIP",
cat = ESGFCatalog(esgf1_indices="esgf-node.llnl.gov")
print(cat)
cat = ESGFCatalog(esgf1_indices=["esgf-node.llnl.gov"]).search(
experiment_id="historical",
source_id="CESM2",
variable_id=["gpp", "areacella", "sftlf"],
source_id="CanESM5",
variable_id=["gpp"],
variant_label=["r1i1p1f1"],
)
assert len(cat.df) == 3
print(cat)
ds = cat.to_dataset_dict()
assert "gpp" in ds
assert "sftlf" in ds["gpp"]


def test_global_search():
cat = ESGFCatalog(esgf1_indices=True).search(
activity_id="CMIP",
def test_esgroot():
cat = ESGFCatalog()
cat.set_esgf_data_root(cat.local_cache)
cat.search(
experiment_id="historical",
source_id="CESM2",
variable_id=["gpp", "areacella", "sftlf"],
source_id="CanESM5",
variable_id=["gpp"],
variant_label=["r1i1p1f1"],
)
assert len(cat.df) == 3
ds = cat.to_dataset_dict(add_measures=False)
assert "gpp" in ds
log = cat.session_log()
assert "download" not in log
assert f"accessed {cat.esgf_data_root}" in cat.session_log()


def test_noresults():
    """Searching for a nonexistent variable must raise NoSearchResults.

    The original version silently passed when no exception was raised at all;
    we now fail explicitly in that case.
    """
    cat = ESGFCatalog()
    try:
        cat.search(variable_id="does_not_exist")
    except NoSearchResults:
        return
    raise AssertionError("expected NoSearchResults to be raised")


def test_tracking_ids():
Expand All @@ -32,24 +50,49 @@ def test_tracking_ids():
[
"hdl:21.14100/0577d84f-9954-494f-8cc8-465aa4fd910e",
"hdl:21.14100/0972f78b-158e-4c6b-bcdf-7d0d75d7a8cd",
"hdl:21.14100/0e4dfb8f-b677-456e-abc7-71e1ebc16deb",
"hdl:21.14100/17b6c62f-455b-49bc-8674-564f7ca5ed6a",
"hdl:21.14100/1bd030c9-1761-4fca-911e-6ea2b6407bc7",
"hdl:21.14100/2844ea5a-4589-4ed4-bbb7-c13e9964a4b7",
]
)
assert len(cat.df) == 7
assert len(cat.df) == 2


def test_add_cell_measures():
# these measures are in r1i1p1f2 / piControl
cat = ESGFCatalog().search(
variable_id="gpp",
variable_id="mrros",
source_id="UKESM1-0-LL",
variant_label="r2i1p1f2",
frequency="mon",
experiment_id="historical",
)
ds = cat.to_dataset_dict()["gpp"]
ds = cat.to_dataset_dict()["mrros"]
assert "sftlf" in ds
assert "areacella" in ds


def test_modelgroups():
    """Search a small multi-model set and check the number of model groups."""
    facets = dict(
        experiment_id="historical",
        source_id=["CanESM5", "GFDL-CM4"],
        variable_id=["tas", "pr"],
        variant_label=["r1i1p1f1", "r2i1p1f1"],
        table_id="day",
    )
    catalog = ESGFCatalog().search(**facets)
    assert len(catalog.model_groups()) == 4


def test_remove_ensemble():
    """Check that remove_ensembles() reduces a multi-member search to one row."""
    facets = dict(
        experiment_id="historical",
        source_id=["UKESM1-0-LL"],
        variable_id=["tas"],
        table_id="Amon",
    )
    catalog = ESGFCatalog().search(**facets)
    # Several ensemble members before the reduction...
    assert len(catalog.model_groups()) > 1
    catalog.remove_ensembles()
    # ...exactly one remaining row afterwards.
    assert len(catalog.df) == 1
    assert catalog.df.iloc[0]["member_id"] == "r1i1p1f2"


def test_download_dbase():
    """The download summary should expose at least one column."""
    summary = ESGFCatalog().download_summary()
    assert len(summary.columns) > 0
25 changes: 4 additions & 21 deletions intake_esgf/tests/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,17 @@ def test_global_mean():
cat = ESGFCatalog().search(
experiment_id=["historical"],
source_id="CanESM5",
activity_id="CMIP",
variant_label="r1i1p1f1",
variable_id=["nbp", "tas", "fgco2"],
variable_id=["gpp", "fgco2"],
frequency="mon",
)
dsd = cat.to_dataset_dict(ignore_facets=["table_id"])
dsd = trim_time(dsd)
dsd = ops.global_mean(dsd)
assert set(["fgco2", "tas", "nbp"]) == set(dsd.keys())
assert set(["fgco2", "gpp"]) == set(dsd.keys())


def test_ensemble_mean():
    """Ensemble means should be produced per model for each variable."""
    cat = ESGFCatalog().search(
        experiment_id="historical",
        source_id=["CESM2", "CanESM5"],
        variant_label=["r1i1p1f1", "r2i1p1f1", "r3i1p1f1"],
        variable_id=["tas", "pr"],
        frequency="mon",
    )
    dsd = cat.to_dataset_dict(ignore_facets=["institution_id", "table_id"])
    dsd = trim_time(dsd)
    dsd = ops.ensemble_mean(dsd)
    # Set literal instead of `set([...])`; expected keys unchanged.
    assert {
        "CESM2.mean.pr", "CESM2.mean.tas", "CanESM5.mean.pr", "CanESM5.mean.tas"
    } == set(dsd.keys())


def test_composition():
"""Run a test on composition of operators.
Operators may be locally defined, but we expect that the only argument taken is a
Expand All @@ -54,8 +37,8 @@ def test_composition():
cat = ESGFCatalog().search(
experiment_id="historical",
source_id=["CanESM5"],
variant_label=["r1i1p1f1", "r2i1p1f1", "r3i1p1f1"],
variable_id=["tas"],
variant_label=["r1i1p1f1", "r2i1p1f1"],
variable_id=["gpp"],
frequency="mon",
)
ensemble_mean = partial(ops.ensemble_mean, include_std=True)
Expand Down

0 comments on commit 555eef3

Please sign in to comment.