diff --git a/nmdc_runtime/api/core/metadata.py b/nmdc_runtime/api/core/metadata.py index 8f411f25..dc23f88d 100644 --- a/nmdc_runtime/api/core/metadata.py +++ b/nmdc_runtime/api/core/metadata.py @@ -20,7 +20,7 @@ from toolz.dicttoolz import dissoc, assoc_in, get_in from nmdc_runtime.api.models.metadata import ChangesheetIn -from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_name +from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_names # custom named tuple to hold path property information SchemaPathProperties = namedtuple( @@ -169,7 +169,14 @@ def load_changesheet( class_name = data["type"].split(":")[-1] class_name = class_name_dict[class_name] else: - class_name = class_name_dict[collection_name_to_class_name[collection_name]] + class_names = collection_name_to_class_names[collection_name] + if len(class_names) > 1: + raise ValueError( + "cannot unambiguously infer class of document" + f" with `id` {id_} in collection {collection_name}." + " Please ensure explicit `type` is present in document." + ) + class_name = class_name_dict[class_names[0]] # set class name for id df["linkml_class"] = class_name diff --git a/nmdc_runtime/api/endpoints/find.py b/nmdc_runtime/api/endpoints/find.py index 4807c3d3..c6021102 100644 --- a/nmdc_runtime/api/endpoints/find.py +++ b/nmdc_runtime/api/endpoints/find.py @@ -1,4 +1,5 @@ from operator import itemgetter +from typing import List from fastapi import APIRouter, Depends, Form from jinja2 import Environment, PackageLoader, select_autoescape @@ -23,6 +24,7 @@ PipelineFindRequest, PipelineFindResponse, ) +from nmdc_runtime.util import get_class_names_from_collection_spec router = APIRouter() @@ -187,39 +189,57 @@ def attr_index_sort_key(attr): return "_" if attr == "id" else attr -def documentation_links(jsonschema_dict, collection_names): - rv = {"Activity": []} - for cn in collection_names: - last_part = jsonschema_dict["$defs"]["Database"]["properties"][cn]["items"][ - "$ref" - ].split("/")[-1] - entity_attrs = list(jsonschema_dict["$defs"][last_part]["properties"]) - if last_part in ("Biosample", "Study", "DataObject"): - assoc_path = [cn] - else: - assoc_path = ["activity_set", cn] - rv = assoc_in( - rv, - assoc_path, - { - "collection_name": cn, - "entity_url": "https://microbiomedata.github.io/nmdc-schema/" - + last_part, - "entity_name": last_part, +def documentation_links(jsonschema_dict, collection_names) -> dict: + """TODO: Add a docstring saying what this function does at a high level.""" + + # TODO: Document the purpose of this initial key. + doc_links = {"Activity": []} + + # Note: All documentation URLs generated within this function will begin with this. + base_url = r"https://microbiomedata.github.io/nmdc-schema" + + for collection_name in collection_names: + # Since a given collection can be associated with multiple classes, the `doc_links` dictionary + # will have a _list_ of values for each collection. + class_descriptors = [] + + # If the collection name is one that the `search.html` page has a dedicated section for, + # give it a top-level key; otherwise, nest it under `activity_set`. + key_hierarchy: List[str] = ["activity_set", collection_name] + if collection_name in ("biosample_set", "study_set", "data_object_set"): + key_hierarchy = [collection_name] + + # Process the name of each class that the schema associates with this collection. + collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][ + collection_name + ] + class_names = get_class_names_from_collection_spec(collection_spec) + for idx, class_name in enumerate(class_names): + # Make a list of dictionaries, each of which describes one attribute of this class. + entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"]) + entity_attr_descriptors = [ + {"url": f"{base_url}/{attr_name}", "attr_name": attr_name} + for attr_name in entity_attrs + ] + + # Make a dictionary describing this class. + class_descriptor = { + "collection_name": collection_name, + "entity_url": f"{base_url}/{class_name}", + "entity_name": class_name, "entity_attrs": sorted( - [ - { - "url": f"https://microbiomedata.github.io/nmdc-schema/{a}", - "attr_name": a, - } - for a in entity_attrs - ], - key=itemgetter("attr_name"), + entity_attr_descriptors, key=itemgetter("attr_name") ), - }, - ) + } - return rv + # Add that descriptor to this collection's list of class descriptors. + class_descriptors.append(class_descriptor) + + # Add a key/value pair describing this collection to the `doc_links` dictionary. + # Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in + doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors) + + return doc_links @router.get("/search", response_class=HTMLResponse) diff --git a/nmdc_runtime/api/endpoints/metadata.py b/nmdc_runtime/api/endpoints/metadata.py index ed07892d..8a018e6c 100644 --- a/nmdc_runtime/api/endpoints/metadata.py +++ b/nmdc_runtime/api/endpoints/metadata.py @@ -155,15 +155,6 @@ def fetch_downloaded_json(url, save_dir): return json.load(f) -# FIX (2021-12-16): this variable does not seem to be used anywhere else. -# Can it be deleted? Commenting out for now. -# type_collections = { -# f'nmdc:{spec["items"]["$ref"].split("/")[-1]}': collection_name -# for collection_name, spec in nmdc_jsonschema["properties"].items() -# if collection_name.endswith("_set") -# } - - @router.post("/metadata/json:validate_urls_file") async def validate_json_urls_file(urls_file: UploadFile = File(...)): """ diff --git a/nmdc_runtime/templates/search.html b/nmdc_runtime/templates/search.html index aca58e15..c1167439 100644 --- a/nmdc_runtime/templates/search.html +++ b/nmdc_runtime/templates/search.html @@ -38,6 +38,9 @@ ul { margin-top: 0; } + .collection-name { + font-weight: bold; + } @@ -47,10 +50,10 @@

NMDC Runtime Search API

- {% for attr in doc_links.study_set.entity_attrs %} + {% for attr in doc_links.study_set[0].entity_attrs %} {{ attr.attr_name }}{% if not loop.last %}, {% endif %} {% endfor %}

@@ -72,10 +75,10 @@

Copy-and-paste Examples:

- {% for attr in doc_links.biosample_set.entity_attrs %} + {% for attr in doc_links.biosample_set[0].entity_attrs %} {{ attr.attr_name }}{% if not loop.last %}, {% endif %} {% endfor %}

@@ -97,10 +100,10 @@

Copy-and-paste Examples:

- {% for attr in doc_links.data_object_set.entity_attrs %} + {% for attr in doc_links.data_object_set[0].entity_attrs %} {{ attr.attr_name }}{% if not loop.last %}, {% endif %} {% endfor %}

@@ -126,9 +129,9 @@

Copy-and-paste Examples:

{% for cname in activity_collection_names %} - {{ doc_links.activity_set[cname].entity_name }}{% if not loop.last %}, {% endif %} + {{ doc_links.activity_set[cname][0].entity_name }}{% if not loop.last %}, {% endif %}
- {% for attr in doc_links.activity_set[cname].entity_attrs %} + {% for attr in doc_links.activity_set[cname][0].entity_attrs %} {{ attr.attr_name }}{% if not loop.last %}, {% endif %} {% endfor %}
diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 475a98d4..2ac47b69 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -10,6 +10,7 @@ from io import BytesIO from pathlib import Path from uuid import uuid4 +from typing import List, Optional, Set, Dict import fastjsonschema import requests @@ -27,13 +28,67 @@ from typing_extensions import Annotated +def get_class_names_from_collection_spec( + spec: dict, prefix: Optional[str] = None +) -> List[str]: + """ + Returns the list of classes referenced by the `$ref` values in a JSON Schema snippet describing a collection, + applying an optional prefix to each class name. + + >>> get_class_names_from_collection_spec({"items": {"foo": "#/$defs/A"}}) + [] + >>> get_class_names_from_collection_spec({"items": {"$ref": "#/$defs/A"}}) + ['A'] + >>> get_class_names_from_collection_spec({"items": {"$ref": "#/$defs/A"}}, "p:") + ['p:A'] + >>> get_class_names_from_collection_spec({"items": {"anyOf": "not-a-list"}}) + [] + >>> get_class_names_from_collection_spec({"items": {"anyOf": []}}) + [] + >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}]}}) + ['A'] + >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}, {"$ref": "#/$defs/B"}]}}) + ['A', 'B'] + >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}, {"$ref": "#/$defs/B"}]}}, "p:") + ['p:A', 'p:B'] + """ + + class_names = [] + if "items" in spec: + # If the `items` dictionary has a key named `$ref`, get the single class name from it. + if "$ref" in spec["items"]: + ref_dict = spec["items"]["$ref"] + class_name = ref_dict.split("/")[-1] # e.g. `#/$defs/Foo` --> `Foo` + class_names.append(class_name) + + # Else, if it has a key named `anyOf` whose value is a list, get the class name from each ref in the list. + elif "anyOf" in spec["items"] and isinstance(spec["items"]["anyOf"], list): + for element in spec["items"]["anyOf"]: + ref_dict = element["$ref"] + class_name = ref_dict.split("/")[-1] # e.g. `#/$defs/Foo` --> `Foo` + class_names.append(class_name) + + # Apply the specified prefix, if any, to each class name. + if isinstance(prefix, str): + class_names = list(map(lambda name: f"{prefix}{name}", class_names)) + + return class_names + + @lru_cache -def get_type_collections(): - return { - f'nmdc:{spec["items"]["$ref"].split("/")[-1]}': collection_name - for collection_name, spec in nmdc_jsonschema["properties"].items() - if collection_name.endswith("_set") - } +def get_type_collections() -> dict: + """Returns a dictionary mapping class names to Mongo collection names.""" + + mappings = {} + + # Process the `items` dictionary of each collection whose name ends with `_set`. + for collection_name, spec in nmdc_jsonschema["properties"].items(): + if collection_name.endswith("_set"): + class_names = get_class_names_from_collection_spec(spec, "nmdc:") + for class_name in class_names: + mappings[class_name] = collection_name + + return mappings def without_id_patterns(nmdc_jsonschema): @@ -312,22 +367,30 @@ def specialize_activity_set_docs(docs): return docs, validation_errors -collection_name_to_class_name = { - db_prop: db_prop_spec["items"]["$ref"].split("/")[-1] - for db_prop, db_prop_spec in get_nmdc_jsonschema_dict()["$defs"]["Database"][ +# Define a mapping from collection name to a list of class names allowable for that collection's documents. +collection_name_to_class_names: Dict[str, List[str]] = { + collection_name: get_class_names_from_collection_spec(spec) + for collection_name, spec in nmdc_jsonschema["$defs"]["Database"][ "properties" ].items() - if "items" in db_prop_spec and "$ref" in db_prop_spec["items"] } @lru_cache -def schema_collection_names_with_id_field(): - return { - coll_name - for coll_name, class_name in collection_name_to_class_name.items() - if "id" in get_nmdc_jsonschema_dict()["$defs"][class_name].get("properties", {}) - } +def schema_collection_names_with_id_field() -> Set[str]: + """ + Returns the set of collection names with which _any_ of the associated classes contains an `id` field. + """ + + target_collection_names = set() + + for collection_name, class_names in collection_name_to_class_names.items(): + for class_name in class_names: + if "id" in nmdc_jsonschema["$defs"][class_name].get("properties", {}): + target_collection_names.add(collection_name) + break + + return target_collection_names def ensure_unique_id_indexes(mdb: MongoDatabase):