fix: Handle anyOf in JSON Schema property (#379)

* fix: Handle `anyOf` in JSON Schema property * Indicate function return value type * Refactor function and add comments * fix: Prefix class name with `nmdc:` * Implement helper function to process both single-ref and multi-ref specs * Document prefix functionality * Fix punctuation in comment * Update dictionary and function to accommodate multiple classes per collection * WIP: Update doc link maker to accommodate collections that map to multiple classes * Clarify variable names * Add comments in an attempt to clarify code * Delete commented-out code that doesn't accommodate multi-class collections * Add tests covering some corner cases * Fix inaccurate type hint * Clarify docstring * Replace reference to nonexistent dict and implement preliminary patch * Make the collection name bold on the search page * Update search page to account for collections mapping to multiple classes * Remove redundant type hints * style: black format * panic on no-type given * add script and api function * update script * Refactor runtime client methods to raise for status and parse and return results * handle omics processing records * update docstring * update to include correct prefix * update to use new insdc_bioproject_identifiers slot on omics_processing * style: black format * add typecodes endpoint (#386) unauthenticated. 
closes #385 * update .gitpod.yml * add sshproxy.sh for nersc tunneling * update make cmd * add gitpod affordance * add gitpod dockerfile * update gitpod stuff * rename * update Makefile * fix * gitpod: pull dev mdb * fix * fix make target * Separate dev and production deployments in GitHub workflow (#382) * Consolidate workflows for building docker images and deploying to Spin into one workflow * Remove docker-build.sh in favor of letting GitHub Actions handle Docker build and push * Update Release Process doc with info about initiating via GitHub Releases * Replace Rancher-Action with generic HTTP call * Replace release event with tag push event, which is required for semver metadata * Remove unnecessary pr event * Add more release instructions * style: fix, and elaborate a bit * Revert stuff This reverts commit 1b2372d. * style: fix, and elaborate a bit --------- Co-authored-by: eecavanna <[email protected]> Co-authored-by: Donny Winston <[email protected]> Co-authored-by: Michael Thornton <[email protected]> Co-authored-by: Donny Winston <[email protected]> Co-authored-by: Jing Cao <[email protected]> Co-authored-by: Patrick Kalita <[email protected]>
microbiomedata · Nov 20, 2023 · 30e3838 · 30e3838
1 parent acbc9f9
commit 30e3838
Show file tree

Hide file tree

Showing 5 changed files with 149 additions and 65 deletions.
diff --git a/nmdc_runtime/api/core/metadata.py b/nmdc_runtime/api/core/metadata.py
@@ -20,7 +20,7 @@
 from toolz.dicttoolz import dissoc, assoc_in, get_in
 
 from nmdc_runtime.api.models.metadata import ChangesheetIn
-from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_name
+from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_names
 
 # custom named tuple to hold path property information
 SchemaPathProperties = namedtuple(
@@ -169,7 +169,14 @@ def load_changesheet(
             class_name = data["type"].split(":")[-1]
             class_name = class_name_dict[class_name]
         else:
-            class_name = class_name_dict[collection_name_to_class_name[collection_name]]
+            class_names = collection_name_to_class_names[collection_name]
+            if len(class_names) > 1:
+                raise ValueError(
+                    "cannot unambiguously infer class of document"
+                    f" with `id` {id_} in collection {collection_name}."
+                    " Please ensure explicit `type` is present in document."
+                )
+            class_name = class_name_dict[class_names[0]]
 
         # set class name for id
         df["linkml_class"] = class_name

diff --git a/nmdc_runtime/api/endpoints/find.py b/nmdc_runtime/api/endpoints/find.py
@@ -1,4 +1,5 @@
 from operator import itemgetter
+from typing import List
 
 from fastapi import APIRouter, Depends, Form
 from jinja2 import Environment, PackageLoader, select_autoescape
@@ -23,6 +24,7 @@
     PipelineFindRequest,
     PipelineFindResponse,
 )
+from nmdc_runtime.util import get_class_names_from_collection_spec
 
 router = APIRouter()
 
@@ -187,39 +189,57 @@ def attr_index_sort_key(attr):
     return "_" if attr == "id" else attr
 
 
-def documentation_links(jsonschema_dict, collection_names):
-    rv = {"Activity": []}
-    for cn in collection_names:
-        last_part = jsonschema_dict["$defs"]["Database"]["properties"][cn]["items"][
-            "$ref"
-        ].split("/")[-1]
-        entity_attrs = list(jsonschema_dict["$defs"][last_part]["properties"])
-        if last_part in ("Biosample", "Study", "DataObject"):
-            assoc_path = [cn]
-        else:
-            assoc_path = ["activity_set", cn]
-        rv = assoc_in(
-            rv,
-            assoc_path,
-            {
-                "collection_name": cn,
-                "entity_url": "https://microbiomedata.github.io/nmdc-schema/"
-                + last_part,
-                "entity_name": last_part,
+def documentation_links(jsonschema_dict, collection_names) -> dict:
+    """
+    Builds the nested dictionary of schema documentation links for the specified collections.
+
+    For each collection name, the collection's spec is read from the `Database` class definition in
+    `jsonschema_dict`, and one descriptor is built per class that the spec references. Each descriptor
+    has the keys `collection_name`, `entity_url`, `entity_name`, and `entity_attrs` (a list of
+    `{"url", "attr_name"}` dictionaries sorted by `attr_name`).
+
+    The list of descriptors for `biosample_set`, `study_set`, or `data_object_set` is stored under a
+    top-level key named after the collection; the list for any other collection is stored under
+    `activity_set` -> collection name.
+    """
+
+    # TODO: Document the purpose of this initial key.
+    doc_links = {"Activity": []}
+
+    # Note: All documentation URLs generated within this function will begin with this.
+    base_url = r"https://microbiomedata.github.io/nmdc-schema"
+
+    for collection_name in collection_names:
+        # Since a given collection can be associated with multiple classes, the `doc_links` dictionary
+        # will have a _list_ of values for each collection.
+        class_descriptors = []
+
+        # If the collection name is one that the `search.html` page has a dedicated section for,
+        # give it a top-level key; otherwise, nest it under `activity_set`.
+        key_hierarchy: List[str] = ["activity_set", collection_name]
+        if collection_name in ("biosample_set", "study_set", "data_object_set"):
+            key_hierarchy = [collection_name]
+
+        # Process the name of each class that the schema associates with this collection.
+        collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][
+            collection_name
+        ]
+        class_names = get_class_names_from_collection_spec(collection_spec)
+        for class_name in class_names:
+            # Make a list of dictionaries, each of which describes one attribute of this class.
+            entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"])
+            entity_attr_descriptors = [
+                {"url": f"{base_url}/{attr_name}", "attr_name": attr_name}
+                for attr_name in entity_attrs
+            ]
+
+            # Make a dictionary describing this class.
+            class_descriptor = {
+                "collection_name": collection_name,
+                "entity_url": f"{base_url}/{class_name}",
+                "entity_name": class_name,
                 "entity_attrs": sorted(
-                    [
-                        {
-                            "url": f"https://microbiomedata.github.io/nmdc-schema/{a}",
-                            "attr_name": a,
-                        }
-                        for a in entity_attrs
-                    ],
-                    key=itemgetter("attr_name"),
+                    entity_attr_descriptors, key=itemgetter("attr_name")
                 ),
-            },
-        )
+            }
 
-    return rv
+            # Add that descriptor to this collection's list of class descriptors.
+            class_descriptors.append(class_descriptor)
+
+        # Add a key/value pair describing this collection to the `doc_links` dictionary.
+        # Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in
+        doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors)
+
+    return doc_links
 
 
 @router.get("/search", response_class=HTMLResponse)

diff --git a/nmdc_runtime/api/endpoints/metadata.py b/nmdc_runtime/api/endpoints/metadata.py
@@ -155,15 +155,6 @@ def fetch_downloaded_json(url, save_dir):
         return json.load(f)
 
 
-# FIX (2021-12-16): this variable does not seem to be used anywhere else.
-# Can it be deleted? Commenting out for now.
-# type_collections = {
-#     f'nmdc:{spec["items"]["$ref"].split("/")[-1]}': collection_name
-#     for collection_name, spec in nmdc_jsonschema["properties"].items()
-#     if collection_name.endswith("_set")
-# }
-
-
 @router.post("/metadata/json:validate_urls_file")
 async def validate_json_urls_file(urls_file: UploadFile = File(...)):
     """

diff --git a/nmdc_runtime/templates/search.html b/nmdc_runtime/templates/search.html
@@ -38,6 +38,9 @@
         ul {
             margin-top: 0;
         }
+        .collection-name {
+            font-weight: bold;
+        }
     </style>
 </head>
 <body>
@@ -47,10 +50,10 @@ <h1>NMDC Runtime Search API</h1>
 
 <form action="studies" method="get">
     <label for="filter-studies">
-        Filter <a href="{{ doc_links.study_set.entity_url }}">studies</a>:
+        Filter <a href="{{ doc_links.study_set[0].entity_url }}">studies</a>:
     </label>
     <p>
-        {% for attr in doc_links.study_set.entity_attrs %}
+        {% for attr in doc_links.study_set[0].entity_attrs %}
         <a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
         {% endfor %}
     </p>
@@ -72,10 +75,10 @@ <h4>Copy-and-paste Examples:</h4>
 
 <form action="biosamples" method="get">
     <label for="filter-biosamples">
-        Filter  <a href="{{ doc_links.biosample_set.entity_url }}">biosamples</a>:
+        Filter  <a href="{{ doc_links.biosample_set[0].entity_url }}">biosamples</a>:
     </label>
     <p>
-        {% for attr in doc_links.biosample_set.entity_attrs %}
+        {% for attr in doc_links.biosample_set[0].entity_attrs %}
         <a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
         {% endfor %}
     </p>
@@ -97,10 +100,10 @@ <h4>Copy-and-paste Examples:</h4>
 
 <form action="data_objects" method="get">
     <label for="filter-data_objects">
-        Filter  <a href="{{ doc_links.data_object_set.entity_url }}">data_objects</a>:
+        Filter  <a href="{{ doc_links.data_object_set[0].entity_url }}">data_objects</a>:
         </label>
     <p>
-        {% for attr in doc_links.data_object_set.entity_attrs %}
+        {% for attr in doc_links.data_object_set[0].entity_attrs %}
         <a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
         {% endfor %}
     </p>
@@ -126,9 +129,9 @@ <h4>Copy-and-paste Examples:</h4>
     </label>
     <p>
         {% for cname in activity_collection_names %}
-        <a href="{{ doc_links.activity_set[cname].entity_url }}">{{ doc_links.activity_set[cname].entity_name }}</a>{% if not loop.last %}, {% endif %}
+        <a class="collection-name" href="{{ doc_links.activity_set[cname][0].entity_url }}">{{ doc_links.activity_set[cname][0].entity_name }}</a>{% if not loop.last %}, {% endif %}
         <br/>
-            {% for attr in doc_links.activity_set[cname].entity_attrs %}
+            {% for attr in doc_links.activity_set[cname][0].entity_attrs %}
             <a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
             {% endfor %}
         <br/>

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
@@ -10,6 +10,7 @@
 from io import BytesIO
 from pathlib import Path
 from uuid import uuid4
+from typing import List, Optional, Set, Dict
 
 import fastjsonschema
 import requests
@@ -27,13 +28,67 @@
 from typing_extensions import Annotated
 
 
+def get_class_names_from_collection_spec(
+    spec: dict, prefix: Optional[str] = None
+) -> List[str]:
+    """
+    Returns the list of classes referenced by the `$ref` values in a JSON Schema snippet describing a collection,
+    applying an optional prefix to each class name.
+
+    Two shapes are recognized: `items` holding a single `$ref`, and `items` holding an `anyOf` list of
+    `$ref` dictionaries. A `$ref` directly under `items` takes precedence over `anyOf`. Anything that does not
+    match (including `anyOf` elements that are not dictionaries with a string `$ref`) is ignored rather than
+    raising, so an unsupported spec yields an empty (or shorter) list.
+
+    :param spec: JSON Schema snippet describing one collection.
+    :param prefix: Optional string prepended verbatim to each class name (e.g. `"nmdc:"`).
+    :return: The referenced class names, in the order in which they appear in the spec.
+
+    >>> get_class_names_from_collection_spec({"items": {"foo": "#/$defs/A"}})
+    []
+    >>> get_class_names_from_collection_spec({"items": {"$ref": "#/$defs/A"}})
+    ['A']
+    >>> get_class_names_from_collection_spec({"items": {"$ref": "#/$defs/A"}}, "p:")
+    ['p:A']
+    >>> get_class_names_from_collection_spec({"items": {"anyOf": "not-a-list"}})
+    []
+    >>> get_class_names_from_collection_spec({"items": {"anyOf": []}})
+    []
+    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}]}})
+    ['A']
+    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}, {"$ref": "#/$defs/B"}]}})
+    ['A', 'B']
+    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}, {"$ref": "#/$defs/B"}]}}, "p:")
+    ['p:A', 'p:B']
+    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"type": "string"}, {"$ref": "#/$defs/A"}]}})
+    ['A']
+    >>> get_class_names_from_collection_spec({})
+    []
+    """
+
+    items = spec.get("items")
+    if not isinstance(items, dict):
+        return []
+
+    # Collect the raw `$ref` values from whichever supported shape is present.
+    if "$ref" in items:
+        refs = [items["$ref"]]
+    elif isinstance(items.get("anyOf"), list):
+        # Skip `anyOf` members that are not dictionaries (e.g. a stray string) instead of crashing on them;
+        # members without a `$ref` key yield `None` here and are filtered out below.
+        refs = [
+            element.get("$ref")
+            for element in items["anyOf"]
+            if isinstance(element, dict)
+        ]
+    else:
+        refs = []
+
+    # Keep only the final path segment of each string ref, e.g. `#/$defs/Foo` --> `Foo`.
+    class_names = [ref.split("/")[-1] for ref in refs if isinstance(ref, str)]
+
+    # Apply the specified prefix, if any, to each class name.
+    if isinstance(prefix, str):
+        class_names = [f"{prefix}{name}" for name in class_names]
+
+    return class_names
+
+
 @lru_cache
-def get_type_collections():
-    return {
-        f'nmdc:{spec["items"]["$ref"].split("/")[-1]}': collection_name
-        for collection_name, spec in nmdc_jsonschema["properties"].items()
-        if collection_name.endswith("_set")
-    }
+def get_type_collections() -> dict:
+    """
+    Returns a dictionary mapping `nmdc:`-prefixed class names to Mongo collection names.
+
+    Only collections whose name ends with `_set` are considered. If more than one such collection
+    references the same class, the collection encountered last wins.
+    """
+
+    return {
+        prefixed_class_name: collection_name
+        for collection_name, spec in nmdc_jsonschema["properties"].items()
+        if collection_name.endswith("_set")
+        for prefixed_class_name in get_class_names_from_collection_spec(spec, "nmdc:")
+    }
 
 
 def without_id_patterns(nmdc_jsonschema):
@@ -312,22 +367,30 @@ def specialize_activity_set_docs(docs):
     return docs, validation_errors
 
 
-collection_name_to_class_name = {
-    db_prop: db_prop_spec["items"]["$ref"].split("/")[-1]
-    for db_prop, db_prop_spec in get_nmdc_jsonschema_dict()["$defs"]["Database"][
+# Define a mapping from collection name to a list of class names allowable for that collection's documents.
+# Every property of the `Database` class gets an entry; the list is empty for a property whose spec
+# contains no class reference that `get_class_names_from_collection_spec` recognizes.
+collection_name_to_class_names: Dict[str, List[str]] = {
+    collection_name: get_class_names_from_collection_spec(spec)
+    for collection_name, spec in nmdc_jsonschema["$defs"]["Database"][
         "properties"
     ].items()
-    if "items" in db_prop_spec and "$ref" in db_prop_spec["items"]
 }
 
 
 @lru_cache
-def schema_collection_names_with_id_field():
-    return {
-        coll_name
-        for coll_name, class_name in collection_name_to_class_name.items()
-        if "id" in get_nmdc_jsonschema_dict()["$defs"][class_name].get("properties", {})
-    }
+def schema_collection_names_with_id_field() -> Set[str]:
+    """
+    Returns the names of the collections for which at least one associated class declares an `id` property.
+
+    A collection that has no associated classes is never included.
+    """
+
+    def declares_id(class_name: str) -> bool:
+        # A class definition without a `properties` key is treated as having no properties.
+        class_spec = nmdc_jsonschema["$defs"][class_name]
+        return "id" in class_spec.get("properties", {})
+
+    return {
+        collection_name
+        for collection_name, class_names in collection_name_to_class_names.items()
+        if any(declares_id(class_name) for class_name in class_names)
+    }
 
 
 def ensure_unique_id_indexes(mdb: MongoDatabase):