Skip to content

Commit

Permalink
FIX: Improve test coverage and runtime (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
nocollier authored Jan 25, 2024
1 parent 5ea9997 commit 555eef3
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 96 deletions.
57 changes: 0 additions & 57 deletions intake_esgf/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,66 +182,9 @@ def parallel_download(
return None, None


def get_relative_esgf_path(entry: dict[str, Any]) -> Path:
    """Return the relative ESGF path from the Globus entry.

    Parameters
    ----------
    entry
        A Globus search result entry; must contain a ``content`` mapping with
        at least the ``version``, ``dataset_id`` and
        ``directory_format_template_`` keys.

    Returns
    -------
    Path
        The dataset path relative to the ESGF data root.

    Raises
    ------
    ValueError
        If ``entry`` has no ``content`` or the content lacks required keys.
    """
    if "content" not in entry:
        raise ValueError("'content' not part of the entry.")
    # Work on a shallow copy so the caller's entry is not mutated when we
    # rewrite `version` below.
    content = dict(entry["content"])
    if {"version", "dataset_id", "directory_format_template_"}.difference(
        content.keys()
    ):
        raise ValueError("Entry content does not contain expected keys.")
    # For some reason, the `version` in the globus response is just an integer and
    # not what is used in the file path so I have to parse it out of `dataset_id`.
    content["version"] = [content["dataset_id"].split("|")[0].split(".")[-1]]
    # Format the file path using the template in the response: each `%(key)s`
    # placeholder becomes `{key[0]}` because the content values are lists.
    template = content["directory_format_template_"][0]
    file_path = Path(
        template.replace("%(root)s/", "")
        .replace("%(", "{")
        .replace(")s", "[0]}")
        .format(**content)
    )
    return file_path


def combine_file_info(indices, dataset_ids: list[str]) -> list[dict[str, Any]]:
    """Combine file information for the given datasets from all indices.

    List-valued entries (such as download locations) found under the same file
    ``path`` are concatenated; scalar entries keep their first-seen value.

    Parameters
    ----------
    indices
        A list of index classes, see `intake_esgf.core`.
    dataset_ids
        The dataset_ids for which we are seeking file information.

    Returns
    -------
    list[dict[str, Any]]
        One merged info record per unique file path.
    """
    # NOTE: the original return annotation said `dict[str, Any]` but the
    # function has always returned a list of records; annotation corrected.
    merged_info: dict[str, dict[str, Any]] = {}
    for ind in indices:
        try:
            infos = ind.get_file_info(dataset_ids)
        except requests.exceptions.RequestException:
            # An unreachable index should not abort the merge; just skip it.
            continue
        # loop thru all the infos and uniquely add by path
        for info in infos:
            merged = merged_info.setdefault(info["path"], {})
            for key, val in info.items():
                if key not in merged:
                    merged[key] = val
                elif isinstance(val, list):
                    # Accumulate alternatives from other indices.
                    merged[key] += val
    return list(merged_info.values())


def check_for_esgf_dataroot() -> Union[Path, None]:
"""Return a direct path to the ESGF data is it exists."""
to_check = [
"/gpfs/alpine/cli137/proj-shared/ESGF/esg_dataroot/css03_data/", # OLCF
"/p/css03/esgf_publish", # Nimbus
"/eagle/projects/ESGF2/esg_dataroot", # ALCF
"/global/cfs/projectdirs/m3522/cmip6/", # NERSC data lake
Expand Down
79 changes: 61 additions & 18 deletions intake_esgf/tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,44 @@
from intake_esgf import ESGFCatalog
from intake_esgf.exceptions import NoSearchResults


def test_search():
cat = ESGFCatalog().search(
activity_id="CMIP",
cat = ESGFCatalog(esgf1_indices="esgf-node.llnl.gov")
print(cat)
cat = ESGFCatalog(esgf1_indices=["esgf-node.llnl.gov"]).search(
experiment_id="historical",
source_id="CESM2",
variable_id=["gpp", "areacella", "sftlf"],
source_id="CanESM5",
variable_id=["gpp"],
variant_label=["r1i1p1f1"],
)
assert len(cat.df) == 3
print(cat)
ds = cat.to_dataset_dict()
assert "gpp" in ds
assert "sftlf" in ds["gpp"]


def test_global_search():
cat = ESGFCatalog(esgf1_indices=True).search(
activity_id="CMIP",
def test_esgroot():
cat = ESGFCatalog()
cat.set_esgf_data_root(cat.local_cache)
cat.search(
experiment_id="historical",
source_id="CESM2",
variable_id=["gpp", "areacella", "sftlf"],
source_id="CanESM5",
variable_id=["gpp"],
variant_label=["r1i1p1f1"],
)
assert len(cat.df) == 3
ds = cat.to_dataset_dict(add_measures=False)
assert "gpp" in ds
log = cat.session_log()
assert "download" not in log
assert f"accessed {cat.esgf_data_root}" in cat.session_log()


def test_noresults():
    """Searching for a nonexistent variable must raise NoSearchResults.

    The original version silently passed when no exception was raised at all;
    we now fail explicitly in that case.
    """
    cat = ESGFCatalog()
    try:
        cat.search(variable_id="does_not_exist")
    except NoSearchResults:
        return
    raise AssertionError("expected NoSearchResults to be raised")


def test_tracking_ids():
Expand All @@ -32,24 +50,49 @@ def test_tracking_ids():
[
"hdl:21.14100/0577d84f-9954-494f-8cc8-465aa4fd910e",
"hdl:21.14100/0972f78b-158e-4c6b-bcdf-7d0d75d7a8cd",
"hdl:21.14100/0e4dfb8f-b677-456e-abc7-71e1ebc16deb",
"hdl:21.14100/17b6c62f-455b-49bc-8674-564f7ca5ed6a",
"hdl:21.14100/1bd030c9-1761-4fca-911e-6ea2b6407bc7",
"hdl:21.14100/2844ea5a-4589-4ed4-bbb7-c13e9964a4b7",
]
)
assert len(cat.df) == 7
assert len(cat.df) == 2


def test_add_cell_measures():
# these measures are in r1i1p1f2 / piControl
cat = ESGFCatalog().search(
variable_id="gpp",
variable_id="mrros",
source_id="UKESM1-0-LL",
variant_label="r2i1p1f2",
frequency="mon",
experiment_id="historical",
)
ds = cat.to_dataset_dict()["gpp"]
ds = cat.to_dataset_dict()["mrros"]
assert "sftlf" in ds
assert "areacella" in ds


def test_modelgroups():
    """Search a small multi-model set and check the number of model groups."""
    facets = dict(
        experiment_id="historical",
        source_id=["CanESM5", "GFDL-CM4"],
        variable_id=["tas", "pr"],
        variant_label=["r1i1p1f1", "r2i1p1f1"],
        table_id="day",
    )
    catalog = ESGFCatalog().search(**facets)
    assert len(catalog.model_groups()) == 4


def test_remove_ensemble():
    """Check that remove_ensembles() reduces a multi-member search to one row."""
    facets = dict(
        experiment_id="historical",
        source_id=["UKESM1-0-LL"],
        variable_id=["tas"],
        table_id="Amon",
    )
    catalog = ESGFCatalog().search(**facets)
    # Several ensemble members before the reduction...
    assert len(catalog.model_groups()) > 1
    catalog.remove_ensembles()
    # ...exactly one remaining row afterwards.
    assert len(catalog.df) == 1
    assert catalog.df.iloc[0]["member_id"] == "r1i1p1f2"


def test_download_dbase():
    """The download summary should expose at least one column."""
    summary = ESGFCatalog().download_summary()
    assert len(summary.columns) > 0
25 changes: 4 additions & 21 deletions intake_esgf/tests/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,17 @@ def test_global_mean():
cat = ESGFCatalog().search(
experiment_id=["historical"],
source_id="CanESM5",
activity_id="CMIP",
variant_label="r1i1p1f1",
variable_id=["nbp", "tas", "fgco2"],
variable_id=["gpp", "fgco2"],
frequency="mon",
)
dsd = cat.to_dataset_dict(ignore_facets=["table_id"])
dsd = trim_time(dsd)
dsd = ops.global_mean(dsd)
assert set(["fgco2", "tas", "nbp"]) == set(dsd.keys())
assert set(["fgco2", "gpp"]) == set(dsd.keys())


def test_ensemble_mean():
    """Ensemble means should be produced per model for each variable."""
    cat = ESGFCatalog().search(
        experiment_id="historical",
        source_id=["CESM2", "CanESM5"],
        variant_label=["r1i1p1f1", "r2i1p1f1", "r3i1p1f1"],
        variable_id=["tas", "pr"],
        frequency="mon",
    )
    dsd = cat.to_dataset_dict(ignore_facets=["institution_id", "table_id"])
    dsd = trim_time(dsd)
    dsd = ops.ensemble_mean(dsd)
    # Set literal instead of `set([...])`; expected keys unchanged.
    assert {
        "CESM2.mean.pr", "CESM2.mean.tas", "CanESM5.mean.pr", "CanESM5.mean.tas"
    } == set(dsd.keys())


def test_composition():
"""Run a test on composition of operators.
Operators may be locally defined, but we expect that the only argument taken is a
Expand All @@ -54,8 +37,8 @@ def test_composition():
cat = ESGFCatalog().search(
experiment_id="historical",
source_id=["CanESM5"],
variant_label=["r1i1p1f1", "r2i1p1f1", "r3i1p1f1"],
variable_id=["tas"],
variant_label=["r1i1p1f1", "r2i1p1f1"],
variable_id=["gpp"],
frequency="mon",
)
ensemble_mean = partial(ops.ensemble_mean, include_std=True)
Expand Down

0 comments on commit 555eef3

Please sign in to comment.