Implement refscan-based real-time referential integrity checking on /metadata/json:validate #835

Merged
25 commits
0d4fe3b
Add the `refscan` PyPI package (v0.1.22) as a dependency of the Runtime
eecavanna Dec 12, 2024
a8422aa
Generate and cache a list of schema-allowed references upon app startup
eecavanna Dec 12, 2024
9bc8d1f
Get `SchemaView` via existing `util` function instead of independently
eecavanna Dec 12, 2024
7bf2936
Check referential integrity on `/metadata/json:validate` in real time
eecavanna Dec 12, 2024
d8f69d1
style: reformat
invalid-email-address Dec 12, 2024
82ce38e
Remove redundant comment, replicate `refscan` code, and run `black`
eecavanna Dec 12, 2024
ae83dc9
Implement basic automated tests targeting the `validate_json` function
eecavanna Dec 12, 2024
7358d8b
Add test demonstrating function behavior with invalid collection name
eecavanna Dec 12, 2024
52b0865
Rename directory to work around naming conflict reported by pytest
eecavanna Dec 12, 2024
a7e73c9
Check referential integrity after processing _all_ specified collections
eecavanna Dec 18, 2024
3389a4d
style: reformat
invalid-email-address Dec 18, 2024
bfd880f
Disable referential integrity checking in `validate_json` by default
eecavanna Dec 18, 2024
497ef60
Delegate more referential integrity checking steps to `refscan` library
eecavanna Jan 13, 2025
2f149c8
Explicitly state that we are ignoring a function's return value
eecavanna Jan 14, 2025
9086a86
style: reformat
invalid-email-address Jan 14, 2025
252e3ea
Merge branch 'refs/heads/main' into 831-implement-refscan-based-real-…
eecavanna Jan 14, 2025
a4001ee
Avoid invoking `insert_many` when there are no documents to insert
eecavanna Jan 14, 2025
62fb1a4
Update automated tests to exercise referential integrity checking
eecavanna Jan 14, 2025
776bc34
Handle collection named `@type` whose documents are strings, not dicts
eecavanna Jan 14, 2025
99826e7
style: reformat
invalid-email-address Jan 14, 2025
dca5b82
Clarify comments
eecavanna Jan 14, 2025
7eba761
Distinguish list from non-list values (to support `@type` collection)
eecavanna Jan 14, 2025
999ea08
style: reformat
invalid-email-address Jan 14, 2025
c981e1f
Refactor: Split large unit test into multiple smaller ones
eecavanna Jan 14, 2025
90c7070
Clarify comments
eecavanna Jan 14, 2025
15 changes: 15 additions & 0 deletions nmdc_runtime/api/main.py
@@ -1,6 +1,7 @@
import os
import re
from contextlib import asynccontextmanager
from functools import cache
from importlib import import_module
from importlib.metadata import version
from typing import Annotated
@@ -19,6 +20,7 @@

from nmdc_runtime.api.analytics import Analytics
from nmdc_runtime.util import (
get_allowed_references,
ensure_unique_id_indexes,
REPO_ROOT_DIR,
)
@@ -356,9 +358,22 @@ def ensure_default_api_perms():

@asynccontextmanager
async def lifespan(app: FastAPI):
r"""
Prepares the application to receive requests.

From the [FastAPI documentation](https://fastapi.tiangolo.com/advanced/events/#lifespan-function):
> You can define logic (code) that should be executed before the application starts up. This means that
> this code will be executed once, before the application starts receiving requests.

    Note: Based on my own observations, this function seems to be called when the first request arrives,
    not earlier (i.e. not while the application sits idle before any requests have come in).
"""
ensure_initial_resources_on_boot()
ensure_attribute_indexes()
ensure_default_api_perms()
_ = (
get_allowed_references()
) # note: future invocations will benefit from the function's memoized-ness
yield
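
The warm-up call above can be sketched with stdlib pieces alone, no FastAPI required. This is an illustrative pattern, not code from the PR: `load_expensive_resource` stands in for `get_allowed_references`, and the call counter exists only to show that the memoized function runs exactly once during startup.

```python
import asyncio
from contextlib import asynccontextmanager
from functools import cache

CALLS = {"n": 0}

@cache
def load_expensive_resource() -> list:
    CALLS["n"] += 1  # counts real (non-memoized) executions
    return ["ref-a", "ref-b"]  # placeholder for the schema-derived reference list

@asynccontextmanager
async def lifespan(app=None):
    _ = load_expensive_resource()  # warm the cache before serving requests
    yield

async def main() -> int:
    async with lifespan():
        load_expensive_resource()  # request-time calls are served from the cache
        load_expensive_resource()
    return CALLS["n"]

print(asyncio.run(main()))  # → 1: the resource was computed exactly once
```

Because `@cache` memoizes the zero-argument function, every later caller gets the already-computed value at no cost.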


162 changes: 161 additions & 1 deletion nmdc_runtime/util.py
@@ -22,6 +22,13 @@
from pydantic import Field, BaseModel
from pymongo.database import Database as MongoDatabase
from pymongo.errors import OperationFailure
from refscan.lib.helpers import (
derive_schema_class_name_from_document,
identify_references,
)
from refscan.lib.Finder import Finder
from refscan.lib.ReferenceList import ReferenceList
from refscan.lib.Violation import Violation
from toolz import merge, unique

from nmdc_runtime.api.core.util import sha256hash_from_file
@@ -76,6 +83,23 @@ def get_class_names_from_collection_spec(
return class_names


@lru_cache
def get_allowed_references() -> ReferenceList:
r"""
Returns a `ReferenceList` of all the inter-document references that
the NMDC Schema allows a schema-compliant MongoDB database to contain.
"""

# Identify the inter-document references that the schema allows a database to contain.
print("Identifying schema-allowed references.")
references = identify_references(
schema_view=nmdc_schema_view(),
collection_name_to_class_names=collection_name_to_class_names,
)

return references


@lru_cache
def get_type_collections() -> dict:
"""Returns a dictionary mapping class names to Mongo collection names."""
@@ -497,6 +521,13 @@ def __enter__(self):
def __exit__(self, exc_type, exc_value, traceback):
self._bottom_db.client.drop_database(self._top_db.name)

def get_collection(self, coll_name: str):
r"""Returns a reference to the specified collection."""
try:
return self._top_db[coll_name]
except OperationFailure as e:
raise OverlayDBError(str(e.details))

def replace_or_insert_many(self, coll_name, documents: list):
try:
self._top_db[coll_name].insert_many(documents)
@@ -547,7 +578,30 @@ def merge_find(self, coll_name, find_spec: dict):
yield doc


def validate_json(in_docs: dict, mdb: MongoDatabase):
def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = False):
r"""
Checks whether the specified dictionary represents a valid instance of the `Database` class
defined in the NMDC Schema.

Example dictionary:
{
"biosample_set": [
{"id": "nmdc:bsm-00-000001", ...},
{"id": "nmdc:bsm-00-000002", ...}
],
"study_set": [
{"id": "nmdc:sty-00-000001", ...},
{"id": "nmdc:sty-00-000002", ...}
]
}

:param dict in_docs: The dictionary you want to validate
:param MongoDatabase mdb: A reference to a MongoDB database
:param bool check_references: Whether to check that every document referenced by any of the documents
passed in would exist in the database once those documents were inserted.
In other words, set this to `True` to perform referential integrity checks.
"""
validator = Draft7Validator(get_nmdc_jsonschema_dict())
docs = deepcopy(in_docs)
validation_errors = {}
@@ -587,6 +641,112 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
except Exception as e:
return {"result": "errors", "detail": str(e)}

# Third pass (if enabled): Check inter-document references.
if check_references is True:
# Insert all documents specified for all collections specified, into the OverlayDB.
#
# Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to,
# instead, validate it after processing _each_ collection, we would get a false positive if a document
# inserted into an earlier-processed collection happened to reference a document slated for insertion
# into a later-processed collection. By waiting until all documents in all collections specified have
# been inserted, we avoid that scenario.
#
with OverlayDB(mdb) as overlay_db:
print("Inserting documents into the OverlayDB.")
for collection_name, documents_to_insert in docs.items():
try:
overlay_db.replace_or_insert_many(
collection_name, documents_to_insert
)
except OverlayDBError as error:
validation_errors[collection_name].append(str(error))

# Now that the OverlayDB contains all the specified documents, we will check whether
# every document referenced by any of the inserted documents exists.
finder = Finder(database=overlay_db)
references = get_allowed_references()
reference_field_names_by_source_class_name = (
references.get_reference_field_names_by_source_class_name()
)
for source_collection_name, documents_inserted in docs.items():
# Check the referential integrity of the replaced or inserted documents.
#
# Note: Much of this code was copy/pasted from refscan, at:
# https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
#
print(
f"Checking references emanating from documents inserted into '{source_collection_name}'."
)
for document in documents_inserted:
# Get the document's schema class name so that we can interpret its fields accordingly.
source_class_name = derive_schema_class_name_from_document(
schema_view=nmdc_schema_view(),
document=document,
)

# Get the names of that class's fields that can contain references.
names_of_reference_fields = (
reference_field_names_by_source_class_name.get(
source_class_name, []
)
)

# Check each field that both (a) exists in the document and (b) can contain a reference.
for field_name in names_of_reference_fields:
if field_name in document:
# Determine which collections can contain the referenced document, based upon
# the schema class of which this source document is an instance.
target_collection_names = (
references.get_target_collection_names(
source_class_name=source_class_name,
source_field_name=field_name,
)
)

# Handle both the multi-value (array) and the single-value (scalar) case,
# normalizing the value or values into a list of values in either case.
if type(document[field_name]) is list:
target_ids = document[field_name]
else:
target_id = document[field_name]
target_ids = [target_id] # makes a one-item list

for target_id in target_ids:
name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections(
collection_names=target_collection_names,
document_id=target_id,
)
if (
name_of_collection_containing_target_document
is None
):
violation = Violation(
source_collection_name=source_collection_name,
source_field_name=field_name,
source_document_object_id=document.get(
"_id"
),
source_document_id=document.get("id"),
target_id=target_id,
name_of_collection_containing_target=name_of_collection_containing_target_document,
)
violation_as_str = (
f"Document '{violation.source_document_id}' "
f"in collection '{violation.source_collection_name}' "
f"has a field '{violation.source_field_name}' that "
f"references a document having id "
f"'{violation.target_id}', but the latter document "
f"does not exist in any of the collections the "
f"NMDC Schema says it can exist in."
)
validation_errors[
source_collection_name
].append(violation_as_str)

# If any collection's error list is not empty, return an error response.
if any(len(v) > 0 for v in validation_errors.values()):
return {"result": "errors", "detail": validation_errors}

return {"result": "All Okay!"}
else:
return {"result": "errors", "detail": validation_errors}
3 changes: 3 additions & 0 deletions requirements/main.in
@@ -37,6 +37,9 @@ python-jose[cryptography]
# Reference: https://github.com/microbiomedata/nmdc-runtime/security/dependabot/8
python-multipart>=0.0.18
pyyaml
# Note: We use `refscan` to get information about inter-document references from the schema and database.
# Reference: https://pypi.org/project/refscan/
refscan==0.1.22
requests
semver
setuptools-scm
19 changes: 16 additions & 3 deletions requirements/main.txt
@@ -21,8 +21,6 @@ anyio==4.7.0
# jupyter-server
# starlette
# watchfiles
appnope==0.1.4
# via ipykernel
argon2-cffi==23.1.0
# via jupyter-server
argon2-cffi-bindings==21.2.0
@@ -94,6 +92,7 @@ click==8.1.7
# linkml-runtime
# mkdocs
# prefixcommons
# typer
# uvicorn
colorama==0.4.6
# via mkdocs-material
@@ -196,6 +195,8 @@ graphql-relay==3.2.0
# via graphene
graphviz==0.20.3
# via linkml
greenlet==3.1.1
# via sqlalchemy
grpcio==1.68.1
# via
# dagster
@@ -361,6 +362,7 @@ linkml-runtime==1.8.3
# linkml
# linkml-dataops
# nmdc-schema
# refscan
lxml==5.3.0
# via -r requirements/main.in
mako==1.3.7
@@ -563,6 +565,7 @@ pymongo==4.9.2
# -r requirements/main.in
# motor
# nmdc-schema
# refscan
pyparsing==3.2.0
# via rdflib
pyshex==0.8.1
@@ -649,6 +652,8 @@ referencing==0.35.1
# jsonschema
# jsonschema-specifications
# jupyter-events
refscan==0.1.22
# via -r requirements/main.in
regex==2024.11.6
# via mkdocs-material
requests==2.32.3
@@ -681,7 +686,10 @@ rfc3986-validator==0.1.1
rfc3987==1.3.8
# via jsonschema
rich==13.9.4
# via dagster
# via
# dagster
# refscan
# typer
rpds-py==0.22.3
# via
# jsonschema
@@ -702,6 +710,8 @@ send2trash==1.8.3
# via jupyter-server
setuptools-scm==8.1.0
# via -r requirements/main.in
shellingham==1.5.4
# via typer
shexjsg==0.8.2
# via
# pyshex
@@ -792,6 +802,8 @@ traitlets==5.14.3
# nbclient
# nbconvert
# nbformat
typer==0.12.5
# via refscan
types-python-dateutil==2.9.0.20241206
# via arrow
typing-extensions==4.12.2
@@ -810,6 +822,7 @@ typing-extensions==4.12.2
# pydantic-core
# rich
# sqlalchemy
# typer
# uvicorn
tzdata==2024.2
# via pandas
25 changes: 25 additions & 0 deletions tests/test_the_util/README.md
@@ -0,0 +1,25 @@
This directory contains files related to testing code written in the `nmdc_runtime/util.py` file.

### Why this directory is not named `test_util`

I named the directory "`test_the_util`" to work around a limitation of pytest in the context of this repository.

I tried naming the directory "`test_util`" in an attempt to follow the naming convention of the other test directories.
In its name, "`test`" was a verb and "`util`" was a noun (i.e. "to test the utility"). This was in contrast to the file
`../test_util.py` (a file that was already in the repository), in whose name "`test`" is an adjective and "`util`" is a
noun (i.e. "a test-related utility"). However, with those names in place, `pytest` reported this error:

```text
_____________________ ERROR collecting tests/test_util.py ______________________
import file mismatch:
imported module 'tests.test_util' has this __file__ attribute:
/code/tests/test_util
which is not the same as the test file we want to collect:
/code/tests/test_util.py
HINT: remove __pycache__ / .pyc files and/or use a unique basename for your test file modules
```

To work around that, I renamed the directory to `test_the_util` and renamed the contained Python file to match.

That is why the name of this test directory does not follow the naming convention
of the other test directories.
Empty file added tests/test_the_util/__init__.py