Skip to content

Commit

Permalink
fix: Handle anyOf in JSON Schema property (#379)
Browse files Browse the repository at this point in the history
* fix: Handle `anyOf` in JSON Schema property

* Indicate function return value type

* Refactor function and add comments

* fix: Prefix class name with `nmdc:`

* Implement helper function to process both single-ref and multi-ref specs

* Document prefix functionality

* Fix punctuation in comment

* Update dictionary and function to accommodate multiple classes per collection

* WIP: Update doc link maker to accommodate collections that map to multiple classes

* Clarify variable names

* Add comments in an attempt to clarify code

* Delete commented-out code that doesn't accommodate multi-class collections

* Add tests covering some corner cases

* Fix inaccurate type hint

* Clarify docstring

* Replace reference to nonexistent dict and implement preliminary patch

* Make the collection name bold on the search page

* Update search page to account for collections mapping to multiple classes

* Remove redundant type hints

* style: black format

* panic on no-type given

* add script and api function

* update script

* Refactor runtime client methods to raise for status and parse and return results

* handle omics processing records

* update docstring

* update to include correct prefix

* update to use new insdc_bioproject_identifiers slot on omics_processing

* style: black format

* add typecodes endpoint (#386)

unauthenticated.

closes #385

* update .gitpod.yml

* add sshproxy.sh for nersc tunneling

* update make cmd

* add gitpod affordance

* add gitpod dockerfile

* update gitpod stuff

* rename

* update Makefile

* fix

* gitpod: pull dev mdb

* fix

* fix make target

* Separate dev and production deployments in GitHub workflow (#382)

* Consolidate workflows for building docker images and deploying to Spin into one workflow

* Remove docker-build.sh in favor of letting GitHub Actions handle Docker build and push

* Update Release Process doc with info about initiating via GitHub Releases

* Replace Rancher-Action with generic HTTP call

* Replace release event with tag push event, which is required for semver metadata

* Remove unnecessary pr event

* Add more release instructions

* style: fix, and elaborate a bit

* Revert stuff

This reverts commit 1b2372d.

* style: fix, and elaborate a bit

---------

Co-authored-by: eecavanna <[email protected]>
Co-authored-by: Donny Winston <[email protected]>
Co-authored-by: Michael Thornton <[email protected]>
Co-authored-by: Donny Winston <[email protected]>
Co-authored-by: Jing Cao <[email protected]>
Co-authored-by: Patrick Kalita <[email protected]>
  • Loading branch information
7 people authored Nov 20, 2023
1 parent acbc9f9 commit 30e3838
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 65 deletions.
11 changes: 9 additions & 2 deletions nmdc_runtime/api/core/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from toolz.dicttoolz import dissoc, assoc_in, get_in

from nmdc_runtime.api.models.metadata import ChangesheetIn
from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_name
from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_names

# custom named tuple to hold path property information
SchemaPathProperties = namedtuple(
Expand Down Expand Up @@ -169,7 +169,14 @@ def load_changesheet(
class_name = data["type"].split(":")[-1]
class_name = class_name_dict[class_name]
else:
class_name = class_name_dict[collection_name_to_class_name[collection_name]]
class_names = collection_name_to_class_names[collection_name]
if len(class_names) > 1:
raise ValueError(
"cannot unambiguously infer class of document"
f" with `id` {id_} in collection {collection_name}."
" Please ensure explicit `type` is present in document."
)
class_name = class_name_dict[class_names[0]]

# set class name for id
df["linkml_class"] = class_name
Expand Down
80 changes: 50 additions & 30 deletions nmdc_runtime/api/endpoints/find.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from operator import itemgetter
from typing import List

from fastapi import APIRouter, Depends, Form
from jinja2 import Environment, PackageLoader, select_autoescape
Expand All @@ -23,6 +24,7 @@
PipelineFindRequest,
PipelineFindResponse,
)
from nmdc_runtime.util import get_class_names_from_collection_spec

router = APIRouter()

Expand Down Expand Up @@ -187,39 +189,57 @@ def attr_index_sort_key(attr):
return "_" if attr == "id" else attr


def documentation_links(jsonschema_dict, collection_names):
rv = {"Activity": []}
for cn in collection_names:
last_part = jsonschema_dict["$defs"]["Database"]["properties"][cn]["items"][
"$ref"
].split("/")[-1]
entity_attrs = list(jsonschema_dict["$defs"][last_part]["properties"])
if last_part in ("Biosample", "Study", "DataObject"):
assoc_path = [cn]
else:
assoc_path = ["activity_set", cn]
rv = assoc_in(
rv,
assoc_path,
{
"collection_name": cn,
"entity_url": "https://microbiomedata.github.io/nmdc-schema/"
+ last_part,
"entity_name": last_part,
def documentation_links(jsonschema_dict, collection_names) -> dict:
"""TODO: Add a docstring saying what this function does at a high level."""

# TODO: Document the purpose of this initial key.
doc_links = {"Activity": []}

# Note: All documentation URLs generated within this function will begin with this.
base_url = r"https://microbiomedata.github.io/nmdc-schema"

for collection_name in collection_names:
# Since a given collection can be associated with multiple classes, the `doc_links` dictionary
# will have a _list_ of values for each collection.
class_descriptors = []

# If the collection name is one that the `search.html` page has a dedicated section for,
# give it a top-level key; otherwise, nest it under `activity_set`.
key_hierarchy: List[str] = ["activity_set", collection_name]
if collection_name in ("biosample_set", "study_set", "data_object_set"):
key_hierarchy = [collection_name]

# Process the name of each class that the schema associates with this collection.
collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][
collection_name
]
class_names = get_class_names_from_collection_spec(collection_spec)
for idx, class_name in enumerate(class_names):
# Make a list of dictionaries, each of which describes one attribute of this class.
entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"])
entity_attr_descriptors = [
{"url": f"{base_url}/{attr_name}", "attr_name": attr_name}
for attr_name in entity_attrs
]

# Make a dictionary describing this class.
class_descriptor = {
"collection_name": collection_name,
"entity_url": f"{base_url}/{class_name}",
"entity_name": class_name,
"entity_attrs": sorted(
[
{
"url": f"https://microbiomedata.github.io/nmdc-schema/{a}",
"attr_name": a,
}
for a in entity_attrs
],
key=itemgetter("attr_name"),
entity_attr_descriptors, key=itemgetter("attr_name")
),
},
)
}

return rv
# Add that descriptor to this collection's list of class descriptors.
class_descriptors.append(class_descriptor)

# Add a key/value pair describing this collection to the `doc_links` dictionary.
# Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in
doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors)

return doc_links


@router.get("/search", response_class=HTMLResponse)
Expand Down
9 changes: 0 additions & 9 deletions nmdc_runtime/api/endpoints/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,15 +155,6 @@ def fetch_downloaded_json(url, save_dir):
return json.load(f)


# FIX (2021-12-16): this variable does not seem to be used anywhere else.
# Can it be deleted? Commenting out for now.
# type_collections = {
# f'nmdc:{spec["items"]["$ref"].split("/")[-1]}': collection_name
# for collection_name, spec in nmdc_jsonschema["properties"].items()
# if collection_name.endswith("_set")
# }


@router.post("/metadata/json:validate_urls_file")
async def validate_json_urls_file(urls_file: UploadFile = File(...)):
"""
Expand Down
19 changes: 11 additions & 8 deletions nmdc_runtime/templates/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
ul {
margin-top: 0;
}
.collection-name {
font-weight: bold;
}
</style>
</head>
<body>
Expand All @@ -47,10 +50,10 @@ <h1>NMDC Runtime Search API</h1>

<form action="studies" method="get">
<label for="filter-studies">
Filter <a href="{{ doc_links.study_set.entity_url }}">studies</a>:
Filter <a href="{{ doc_links.study_set[0].entity_url }}">studies</a>:
</label>
<p>
{% for attr in doc_links.study_set.entity_attrs %}
{% for attr in doc_links.study_set[0].entity_attrs %}
<a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
{% endfor %}
</p>
Expand All @@ -72,10 +75,10 @@ <h4>Copy-and-paste Examples:</h4>

<form action="biosamples" method="get">
<label for="filter-biosamples">
Filter <a href="{{ doc_links.biosample_set.entity_url }}">biosamples</a>:
Filter <a href="{{ doc_links.biosample_set[0].entity_url }}">biosamples</a>:
</label>
<p>
{% for attr in doc_links.biosample_set.entity_attrs %}
{% for attr in doc_links.biosample_set[0].entity_attrs %}
<a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
{% endfor %}
</p>
Expand All @@ -97,10 +100,10 @@ <h4>Copy-and-paste Examples:</h4>

<form action="data_objects" method="get">
<label for="filter-data_objects">
Filter <a href="{{ doc_links.data_object_set.entity_url }}">data_objects</a>:
Filter <a href="{{ doc_links.data_object_set[0].entity_url }}">data_objects</a>:
</label>
<p>
{% for attr in doc_links.data_object_set.entity_attrs %}
{% for attr in doc_links.data_object_set[0].entity_attrs %}
<a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
{% endfor %}
</p>
Expand All @@ -126,9 +129,9 @@ <h4>Copy-and-paste Examples:</h4>
</label>
<p>
{% for cname in activity_collection_names %}
<a href="{{ doc_links.activity_set[cname].entity_url }}">{{ doc_links.activity_set[cname].entity_name }}</a>{% if not loop.last %}, {% endif %}
<a class="collection-name" href="{{ doc_links.activity_set[cname][0].entity_url }}">{{ doc_links.activity_set[cname][0].entity_name }}</a>{% if not loop.last %}, {% endif %}
<br/>
{% for attr in doc_links.activity_set[cname].entity_attrs %}
{% for attr in doc_links.activity_set[cname][0].entity_attrs %}
<a href="{{ attr.url }}">{{ attr.attr_name }}</a>{% if not loop.last %}, {% endif %}
{% endfor %}
<br/>
Expand Down
95 changes: 79 additions & 16 deletions nmdc_runtime/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from io import BytesIO
from pathlib import Path
from uuid import uuid4
from typing import List, Optional, Set, Dict

import fastjsonschema
import requests
Expand All @@ -27,13 +28,67 @@
from typing_extensions import Annotated


def get_class_names_from_collection_spec(
    spec: dict, prefix: Optional[str] = None
) -> List[str]:
    """
    Returns the list of classes referenced by the `$ref` values in a JSON Schema snippet describing a collection,
    applying an optional prefix to each class name.

    The class name is the last `/`-separated segment of each `$ref` string. A `$ref` directly under `items`
    takes precedence; otherwise each element of an `items.anyOf` list is inspected. Anything that does not
    carry a string `$ref` (e.g. a non-dict `items`, or an `anyOf` entry such as `{"type": "null"}`) is skipped
    rather than raising.

    :param spec: The JSON Schema snippet for one collection (e.g. a `Database` property's spec).
    :param prefix: Optional string prepended verbatim to each class name (e.g. `"nmdc:"`).
    :return: The class names, in the order they appear in the spec; empty if none are found.

    >>> get_class_names_from_collection_spec({"items": {"foo": "#/$defs/A"}})
    []
    >>> get_class_names_from_collection_spec({"items": {"$ref": "#/$defs/A"}})
    ['A']
    >>> get_class_names_from_collection_spec({"items": {"$ref": "#/$defs/A"}}, "p:")
    ['p:A']
    >>> get_class_names_from_collection_spec({"items": {"anyOf": "not-a-list"}})
    []
    >>> get_class_names_from_collection_spec({"items": {"anyOf": []}})
    []
    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}]}})
    ['A']
    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}, {"$ref": "#/$defs/B"}]}})
    ['A', 'B']
    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}, {"$ref": "#/$defs/B"}]}}, "p:")
    ['p:A', 'p:B']
    >>> get_class_names_from_collection_spec({"items": {"anyOf": [{"$ref": "#/$defs/A"}, {"type": "null"}]}})
    ['A']
    >>> get_class_names_from_collection_spec({"items": True})
    []
    """

    # Only a dict-valued `items` can carry `$ref` / `anyOf`; anything else yields no classes.
    items = spec.get("items") if isinstance(spec, dict) else None
    if not isinstance(items, dict):
        return []

    # Collect candidate `$ref` values: either the single direct ref, or one per `anyOf` element.
    if "$ref" in items:
        candidate_refs = [items["$ref"]]
    elif isinstance(items.get("anyOf"), list):
        candidate_refs = [
            element.get("$ref")
            for element in items["anyOf"]
            if isinstance(element, dict)
        ]
    else:
        candidate_refs = []

    # Keep only string refs and reduce each to its final path segment,
    # e.g. `#/$defs/Foo` --> `Foo`.
    class_names = [
        ref.split("/")[-1] for ref in candidate_refs if isinstance(ref, str)
    ]

    # Apply the specified prefix, if any, to each class name.
    if isinstance(prefix, str):
        class_names = [f"{prefix}{name}" for name in class_names]

    return class_names


@lru_cache
def get_type_collections():
return {
f'nmdc:{spec["items"]["$ref"].split("/")[-1]}': collection_name
for collection_name, spec in nmdc_jsonschema["properties"].items()
if collection_name.endswith("_set")
}
def get_type_collections() -> dict:
"""Returns a dictionary mapping class names to Mongo collection names."""

mappings = {}

# Process the `items` dictionary of each collection whose name ends with `_set`.
for collection_name, spec in nmdc_jsonschema["properties"].items():
if collection_name.endswith("_set"):
class_names = get_class_names_from_collection_spec(spec, "nmdc:")
for class_name in class_names:
mappings[class_name] = collection_name

return mappings


def without_id_patterns(nmdc_jsonschema):
Expand Down Expand Up @@ -312,22 +367,30 @@ def specialize_activity_set_docs(docs):
return docs, validation_errors


collection_name_to_class_name = {
db_prop: db_prop_spec["items"]["$ref"].split("/")[-1]
for db_prop, db_prop_spec in get_nmdc_jsonschema_dict()["$defs"]["Database"][
# Define a mapping from collection name to a list of class names allowable for that collection's documents.
collection_name_to_class_names: Dict[str, List[str]] = {
collection_name: get_class_names_from_collection_spec(spec)
for collection_name, spec in nmdc_jsonschema["$defs"]["Database"][
"properties"
].items()
if "items" in db_prop_spec and "$ref" in db_prop_spec["items"]
}


@lru_cache
def schema_collection_names_with_id_field():
return {
coll_name
for coll_name, class_name in collection_name_to_class_name.items()
if "id" in get_nmdc_jsonschema_dict()["$defs"][class_name].get("properties", {})
}
def schema_collection_names_with_id_field() -> Set[str]:
"""
Returns the set of collection names with which _any_ of the associated classes contains an `id` field.
"""

target_collection_names = set()

for collection_name, class_names in collection_name_to_class_names.items():
for class_name in class_names:
if "id" in nmdc_jsonschema["$defs"][class_name].get("properties", {}):
target_collection_names.add(collection_name)
break

return target_collection_names


def ensure_unique_id_indexes(mdb: MongoDatabase):
Expand Down

0 comments on commit 30e3838

Please sign in to comment.