From 0d4fe3bf3c2cc1ea8791cafdb5e329d9eed32938 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Wed, 11 Dec 2024 21:51:40 -0800
Subject: [PATCH 01/12] Add the `refscan` PyPI package (v0.1.22) as a
 dependency of the Runtime

---
 requirements/main.in  |  3 +++
 requirements/main.txt | 19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/requirements/main.in b/requirements/main.in
index 2af8c6e7..9d0e1481 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -37,6 +37,9 @@ python-jose[cryptography]
 #       Reference: https://github.com/microbiomedata/nmdc-runtime/security/dependabot/8
 python-multipart>=0.0.18
 pyyaml
+# Note: We use `refscan` to get information about inter-document references from the schema and database.
+#       Reference: https://pypi.org/project/refscan/
+refscan==0.1.22
 requests
 semver
 setuptools-scm
diff --git a/requirements/main.txt b/requirements/main.txt
index 882d9435..5f683fa5 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -21,8 +21,6 @@ anyio==4.7.0
     #   jupyter-server
     #   starlette
     #   watchfiles
-appnope==0.1.4
-    # via ipykernel
 argon2-cffi==23.1.0
     # via jupyter-server
 argon2-cffi-bindings==21.2.0
@@ -94,6 +92,7 @@ click==8.1.7
     #   linkml-runtime
     #   mkdocs
     #   prefixcommons
+    #   typer
     #   uvicorn
 colorama==0.4.6
     # via mkdocs-material
@@ -196,6 +195,8 @@ graphql-relay==3.2.0
     # via graphene
 graphviz==0.20.3
     # via linkml
+greenlet==3.1.1
+    # via sqlalchemy
 grpcio==1.68.1
     # via
     #   dagster
@@ -361,6 +362,7 @@ linkml-runtime==1.8.3
     #   linkml
     #   linkml-dataops
     #   nmdc-schema
+    #   refscan
 lxml==5.3.0
     # via -r requirements/main.in
 mako==1.3.7
@@ -563,6 +565,7 @@ pymongo==4.9.2
     #   -r requirements/main.in
     #   motor
     #   nmdc-schema
+    #   refscan
 pyparsing==3.2.0
     # via rdflib
 pyshex==0.8.1
@@ -649,6 +652,8 @@ referencing==0.35.1
     #   jsonschema
     #   jsonschema-specifications
     #   jupyter-events
+refscan==0.1.22
+    # via -r requirements/main.in
 regex==2024.11.6
     # via mkdocs-material
 requests==2.32.3
@@ -681,7 +686,10 @@ rfc3986-validator==0.1.1
 rfc3987==1.3.8
     # via jsonschema
 rich==13.9.4
-    # via dagster
+    # via
+    #   dagster
+    #   refscan
+    #   typer
 rpds-py==0.22.3
     # via
     #   jsonschema
@@ -702,6 +710,8 @@ send2trash==1.8.3
     # via jupyter-server
 setuptools-scm==8.1.0
     # via -r requirements/main.in
+shellingham==1.5.4
+    # via typer
 shexjsg==0.8.2
     # via
     #   pyshex
@@ -792,6 +802,8 @@ traitlets==5.14.3
     #   nbclient
     #   nbconvert
     #   nbformat
+typer==0.12.5
+    # via refscan
 types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
@@ -810,6 +822,7 @@ typing-extensions==4.12.2
     #   pydantic-core
     #   rich
     #   sqlalchemy
+    #   typer
     #   uvicorn
 tzdata==2024.2
     # via pandas

From a8422aa87a54a0235a0ce6b51ad73c499a5fea0f Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Wed, 11 Dec 2024 23:12:53 -0800
Subject: [PATCH 02/12] Generate and cache a list of schema-allowed references
 upon app startup

---
 nmdc_runtime/api/main.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index 46518485..25516d74 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -1,6 +1,7 @@
 import os
 import re
 from contextlib import asynccontextmanager
+from functools import cache
 from importlib import import_module
 from importlib.metadata import version
 from typing import Annotated
@@ -13,12 +14,16 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.docs import get_swagger_ui_html
 from fastapi.staticfiles import StaticFiles
+from linkml_runtime.utils.schemaview import SchemaView
+from nmdc_schema.nmdc_data import get_nmdc_schema_definition
+from refscan.lib.helpers import identify_references, ReferenceList
 from setuptools_scm import get_version
 from starlette import status
 from starlette.responses import RedirectResponse, HTMLResponse, FileResponse
 
 from nmdc_runtime.api.analytics import Analytics
 from nmdc_runtime.util import (
+    collection_name_to_class_names,
     ensure_unique_id_indexes,
     REPO_ROOT_DIR,
 )
@@ -354,11 +359,42 @@ def ensure_default_api_perms():
         db["_runtime.api.allow"].create_index("action")
 
 
+@cache  # memoizes the decorated function
+def get_allowed_references() -> ReferenceList:
+    r"""
+    Returns a `ReferenceList` of all the inter-document references that
+    the NMDC Schema allows a schema-compliant MongoDB database to contain.
+    """
+
+    # Instantiate a LinkML `SchemaView` bound to the NMDC Schema.
+    schema_view = SchemaView(get_nmdc_schema_definition())
+
+    # Identify the inter-document references that the schema allows a database to contain.
+    print("Identifying schema-allowed references.")
+    references = identify_references(
+        schema_view=schema_view,
+        collection_name_to_class_names=collection_name_to_class_names
+    )
+
+    return references
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    r"""
+    Prepares the application to receive requests.
+
+    From the [FastAPI documentation](https://fastapi.tiangolo.com/advanced/events/#lifespan-function):
+    > You can define logic (code) that should be executed before the application starts up. This means that
+    > this code will be executed once, before the application starts receiving requests.
+
+    Note: Based on my own observations, I think this function gets called when the first request starts coming in,
+          but not before that (i.e. not when the application is idle before any requests start coming in).
+    """
     ensure_initial_resources_on_boot()
     ensure_attribute_indexes()
     ensure_default_api_perms()
+    _ = get_allowed_references()  # note: future invocations will benefit from the function's memoized-ness
     yield
 
 

From 9bc8d1faee2409690fb423c0ce240fc5051aa3a4 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Wed, 11 Dec 2024 23:32:20 -0800
Subject: [PATCH 03/12] Get `SchemaView` via existing `util` function instead
 of independently

---
 nmdc_runtime/api/main.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index 25516d74..fa0beff7 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -14,8 +14,6 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.docs import get_swagger_ui_html
 from fastapi.staticfiles import StaticFiles
-from linkml_runtime.utils.schemaview import SchemaView
-from nmdc_schema.nmdc_data import get_nmdc_schema_definition
 from refscan.lib.helpers import identify_references, ReferenceList
 from setuptools_scm import get_version
 from starlette import status
@@ -25,6 +23,7 @@
 from nmdc_runtime.util import (
     collection_name_to_class_names,
     ensure_unique_id_indexes,
+    nmdc_schema_view,
     REPO_ROOT_DIR,
 )
 from nmdc_runtime.api.core.auth import (
@@ -366,13 +365,10 @@ def get_allowed_references() -> ReferenceList:
     the NMDC Schema allows a schema-compliant MongoDB database to contain.
     """
 
-    # Instantiate a LinkML `SchemaView` bound to the NMDC Schema.
-    schema_view = SchemaView(get_nmdc_schema_definition())
-
     # Identify the inter-document references that the schema allows a database to contain.
     print("Identifying schema-allowed references.")
     references = identify_references(
-        schema_view=schema_view,
+        schema_view=nmdc_schema_view(),
         collection_name_to_class_names=collection_name_to_class_names
     )
 

From 7bf29363fca16e10fe8ab11f198d8cfd52d0453f Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 12 Dec 2024 00:47:27 -0800
Subject: [PATCH 04/12] Check referential integrity on
 `/metadata/json:validate` in real time

---
 nmdc_runtime/api/main.py |  21 +------
 nmdc_runtime/util.py     | 120 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 20 deletions(-)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index fa0beff7..c443ce9c 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -14,16 +14,14 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.docs import get_swagger_ui_html
 from fastapi.staticfiles import StaticFiles
-from refscan.lib.helpers import identify_references, ReferenceList
 from setuptools_scm import get_version
 from starlette import status
 from starlette.responses import RedirectResponse, HTMLResponse, FileResponse
 
 from nmdc_runtime.api.analytics import Analytics
 from nmdc_runtime.util import (
-    collection_name_to_class_names,
+    get_allowed_references,
     ensure_unique_id_indexes,
-    nmdc_schema_view,
     REPO_ROOT_DIR,
 )
 from nmdc_runtime.api.core.auth import (
@@ -358,23 +356,6 @@ def ensure_default_api_perms():
         db["_runtime.api.allow"].create_index("action")
 
 
-@cache  # memoizes the decorated function
-def get_allowed_references() -> ReferenceList:
-    r"""
-    Returns a `ReferenceList` of all the inter-document references that
-    the NMDC Schema allows a schema-compliant MongoDB database to contain.
-    """
-
-    # Identify the inter-document references that the schema allows a database to contain.
-    print("Identifying schema-allowed references.")
-    references = identify_references(
-        schema_view=nmdc_schema_view(),
-        collection_name_to_class_names=collection_name_to_class_names
-    )
-
-    return references
-
-
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     r"""
diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index eee078f9..05a1007a 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -22,6 +22,13 @@
 from pydantic import Field, BaseModel
 from pymongo.database import Database as MongoDatabase
 from pymongo.errors import OperationFailure
+from refscan.lib.helpers import (
+    derive_schema_class_name_from_document,
+    identify_references,
+)
+from refscan.lib.Finder import Finder
+from refscan.lib.ReferenceList import ReferenceList
+from refscan.lib.Violation import Violation
 from toolz import merge, unique
 
 from nmdc_runtime.api.core.util import sha256hash_from_file
@@ -76,6 +83,23 @@ def get_class_names_from_collection_spec(
     return class_names
 
 
+@lru_cache
+def get_allowed_references() -> ReferenceList:
+    r"""
+    Returns a `ReferenceList` of all the inter-document references that
+    the NMDC Schema allows a schema-compliant MongoDB database to contain.
+    """
+
+    # Identify the inter-document references that the schema allows a database to contain.
+    print("Identifying schema-allowed references.")
+    references = identify_references(
+        schema_view=nmdc_schema_view(),
+        collection_name_to_class_names=collection_name_to_class_names
+    )
+
+    return references
+
+
 @lru_cache
 def get_type_collections() -> dict:
     """Returns a dictionary mapping class names to Mongo collection names."""
@@ -497,6 +521,13 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._bottom_db.client.drop_database(self._top_db.name)
 
+    def get_collection(self, coll_name: str):
+        r"""Returns a reference to the specified collection."""
+        try:
+            return self._top_db[coll_name]
+        except OperationFailure as e:
+            raise OverlayDBError(str(e.details))
+
     def replace_or_insert_many(self, coll_name, documents: list):
         try:
             self._top_db[coll_name].insert_many(documents)
@@ -548,6 +579,22 @@ def merge_find(self, coll_name, find_spec: dict):
 
 
 def validate_json(in_docs: dict, mdb: MongoDatabase):
+    r"""
+    Checks whether the specified dictionary represents a valid instance of the `Database` class
+    defined in the NMDC Schema.
+
+    Example dictionary:
+    {
+        "biosample_set": [
+            {"id": "nmdc:bsm-00-000001", ...},
+            {"id": "nmdc:bsm-00-000002", ...}
+        ],
+        "study_set": [
+            {"id": "nmdc:sty-00-000001", ...},
+            {"id": "nmdc:sty-00-000002", ...}
+        ]
+    }
+    """
     validator = Draft7Validator(get_nmdc_jsonschema_dict())
     docs = deepcopy(in_docs)
     validation_errors = {}
@@ -576,6 +623,79 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                 try:
                     with OverlayDB(mdb) as odb:
                         odb.replace_or_insert_many(coll_name, coll_docs)
+
+                        # Check the referential integrity of the replaced or inserted documents.
+                        #
+                        # Note: If documents being inserted into the _current_ collection
+                        #       refer to documents being inserted into a _different_ collection
+                        #       as part of the same `in_docs` argument, this check will _not_
+                        #       find the latter documents.
+                        #
+                        # TODO: Enhance this referential integrity validation to account for the
+                        #       total of all operations; not just a single collection's operations.
+                        #
+                        # Note: Much of this code was copy/pasted from refscan, at:
+                        #       https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
+                        #
+                        source_collection_name = coll_name  # creates an alias to accommodate the copy/pasted code
+                        finder = Finder(database=odb)  # uses a generic name to accommodate the copy/pasted code
+                        references = get_allowed_references()  # uses a generic name to accommodate the copy/pasted code
+                        reference_field_names_by_source_class_name = references.get_reference_field_names_by_source_class_name()
+                        for document in coll_docs:
+
+                            # Get the document's schema class name so that we can interpret its fields accordingly.
+                            source_class_name = derive_schema_class_name_from_document(
+                                schema_view=nmdc_schema_view(),
+                                document=document,
+                            )
+
+                            # Get the names of that class's fields that can contain references.
+                            # Get the names of that class's fields that can contain references.
+                            names_of_reference_fields = reference_field_names_by_source_class_name.get(source_class_name, [])
+
+                            # Check each field that both (a) exists in the document and (b) can contain a reference.
+                            for field_name in names_of_reference_fields:
+                                if field_name in document:
+
+                                    # Determine which collections can contain the referenced document, based upon
+                                    # the schema class of which this source document is an instance.
+                                    target_collection_names = references.get_target_collection_names(
+                                        source_class_name=source_class_name,
+                                        source_field_name=field_name,
+                                    )
+
+                                    # Handle both the multi-value (array) and the single-value (scalar) case,
+                                    # normalizing the value or values into a list of values in either case.
+                                    if type(document[field_name]) is list:
+                                        target_ids = document[field_name]
+                                    else:
+                                        target_id = document[field_name]
+                                        target_ids = [target_id]  # makes a one-item list
+
+                                    for target_id in target_ids:
+                                        name_of_collection_containing_target_document = (
+                                            finder.check_whether_document_having_id_exists_among_collections(
+                                                collection_names=target_collection_names, document_id=target_id
+                                            )
+                                        )
+                                        if name_of_collection_containing_target_document is None:
+                                            violation = Violation(
+                                                source_collection_name=source_collection_name,
+                                                source_field_name=field_name,
+                                                source_document_object_id=document.get("_id"),
+                                                source_document_id=document.get("id"),
+                                                target_id=target_id,
+                                                name_of_collection_containing_target=None,
+                                            )
+                                            violation_as_str = (f"Document '{violation.source_document_id}' "
+                                                                f"in collection '{violation.source_collection_name}' "
+                                                                f"has a field '{violation.source_field_name}' that "
+                                                                f"references a document having id "
+                                                                f"'{violation.target_id}', but the latter document "
+                                                                f"does not exist in any of the collections the "
+                                                                f"NMDC Schema says it can exist in.")
+                                            raise OverlayDBError(violation_as_str)
+
                 except OverlayDBError as e:
                     validation_errors[coll_name].append(str(e))
 

From d8f69d1db9bdca7df6a87ef06e07cbd45c725acf Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Thu, 12 Dec 2024 08:49:09 +0000
Subject: [PATCH 05/12] style: reformat

---
 nmdc_runtime/api/main.py |  4 ++-
 nmdc_runtime/util.py     | 64 ++++++++++++++++++++++++++--------------
 2 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index c443ce9c..6498d4cb 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -371,7 +371,9 @@ async def lifespan(app: FastAPI):
     ensure_initial_resources_on_boot()
     ensure_attribute_indexes()
     ensure_default_api_perms()
-    _ = get_allowed_references()  # note: future invocations will benefit from the function's memoized-ness
+    _ = (
+        get_allowed_references()
+    )  # note: future invocations will benefit from the function's memoized-ness
     yield
 
 
diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 05a1007a..8e72689d 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -94,7 +94,7 @@ def get_allowed_references() -> ReferenceList:
     print("Identifying schema-allowed references.")
     references = identify_references(
         schema_view=nmdc_schema_view(),
-        collection_name_to_class_names=collection_name_to_class_names
+        collection_name_to_class_names=collection_name_to_class_names,
     )
 
     return references
@@ -638,9 +638,15 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                         #       https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
                         #
                         source_collection_name = coll_name  # creates an alias to accommodate the copy/pasted code
-                        finder = Finder(database=odb)  # uses a generic name to accommodate the copy/pasted code
-                        references = get_allowed_references()  # uses a generic name to accommodate the copy/pasted code
-                        reference_field_names_by_source_class_name = references.get_reference_field_names_by_source_class_name()
+                        finder = Finder(
+                            database=odb
+                        )  # uses a generic name to accommodate the copy/pasted code
+                        references = (
+                            get_allowed_references()
+                        )  # uses a generic name to accommodate the copy/pasted code
+                        reference_field_names_by_source_class_name = (
+                            references.get_reference_field_names_by_source_class_name()
+                        )
                         for document in coll_docs:
 
                             # Get the document's schema class name so that we can interpret its fields accordingly.
@@ -651,7 +657,11 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
 
                             # Get the names of that class's fields that can contain references.
                             # Get the names of that class's fields that can contain references.
-                            names_of_reference_fields = reference_field_names_by_source_class_name.get(source_class_name, [])
+                            names_of_reference_fields = (
+                                reference_field_names_by_source_class_name.get(
+                                    source_class_name, []
+                                )
+                            )
 
                             # Check each field that both (a) exists in the document and (b) can contain a reference.
                             for field_name in names_of_reference_fields:
@@ -659,9 +669,11 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
 
                                     # Determine which collections can contain the referenced document, based upon
                                     # the schema class of which this source document is an instance.
-                                    target_collection_names = references.get_target_collection_names(
-                                        source_class_name=source_class_name,
-                                        source_field_name=field_name,
+                                    target_collection_names = (
+                                        references.get_target_collection_names(
+                                            source_class_name=source_class_name,
+                                            source_field_name=field_name,
+                                        )
                                     )
 
                                     # Handle both the multi-value (array) and the single-value (scalar) case,
@@ -670,30 +682,38 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                                         target_ids = document[field_name]
                                     else:
                                         target_id = document[field_name]
-                                        target_ids = [target_id]  # makes a one-item list
+                                        target_ids = [
+                                            target_id
+                                        ]  # makes a one-item list
 
                                     for target_id in target_ids:
-                                        name_of_collection_containing_target_document = (
-                                            finder.check_whether_document_having_id_exists_among_collections(
-                                                collection_names=target_collection_names, document_id=target_id
-                                            )
+                                        name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections(
+                                            collection_names=target_collection_names,
+                                            document_id=target_id,
                                         )
-                                        if name_of_collection_containing_target_document is None:
+                                        if (
+                                            name_of_collection_containing_target_document
+                                            is None
+                                        ):
                                             violation = Violation(
                                                 source_collection_name=source_collection_name,
                                                 source_field_name=field_name,
-                                                source_document_object_id=document.get("_id"),
+                                                source_document_object_id=document.get(
+                                                    "_id"
+                                                ),
                                                 source_document_id=document.get("id"),
                                                 target_id=target_id,
                                                 name_of_collection_containing_target=None,
                                             )
-                                            violation_as_str = (f"Document '{violation.source_document_id}' "
-                                                                f"in collection '{violation.source_collection_name}' "
-                                                                f"has a field '{violation.source_field_name}' that "
-                                                                f"references a document having id "
-                                                                f"'{violation.target_id}', but the latter document "
-                                                                f"does not exist in any of the collections the "
-                                                                f"NMDC Schema says it can exist in.")
+                                            violation_as_str = (
+                                                f"Document '{violation.source_document_id}' "
+                                                f"in collection '{violation.source_collection_name}' "
+                                                f"has a field '{violation.source_field_name}' that "
+                                                f"references a document having id "
+                                                f"'{violation.target_id}', but the latter document "
+                                                f"does not exist in any of the collections the "
+                                                f"NMDC Schema says it can exist in."
+                                            )
                                             raise OverlayDBError(violation_as_str)
 
                 except OverlayDBError as e:

From 82ce38e0cb43feb5548fe2cc74fa9e0c1ee4ee70 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 12 Dec 2024 10:56:44 -0800
Subject: [PATCH 06/12] Remove redundant comment, replicate `refscan` code, and
 run `black`

---
 nmdc_runtime/util.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 8e72689d..76e22bbf 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -648,14 +648,12 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                             references.get_reference_field_names_by_source_class_name()
                         )
                         for document in coll_docs:
-
                             # Get the document's schema class name so that we can interpret its fields accordingly.
                             source_class_name = derive_schema_class_name_from_document(
                                 schema_view=nmdc_schema_view(),
                                 document=document,
                             )
 
-                            # Get the names of that class's fields that can contain references.
                             # Get the names of that class's fields that can contain references.
                             names_of_reference_fields = (
                                 reference_field_names_by_source_class_name.get(
@@ -666,7 +664,6 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                             # Check each field that both (a) exists in the document and (b) can contain a reference.
                             for field_name in names_of_reference_fields:
                                 if field_name in document:
-
                                     # Determine which collections can contain the referenced document, based upon
                                     # the schema class of which this source document is an instance.
                                     target_collection_names = (
@@ -703,7 +700,7 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                                                 ),
                                                 source_document_id=document.get("id"),
                                                 target_id=target_id,
-                                                name_of_collection_containing_target=None,
+                                                name_of_collection_containing_target=name_of_collection_containing_target_document,
                                             )
                                             violation_as_str = (
                                                 f"Document '{violation.source_document_id}' "

From ae83dc9e7162756dea4c572ac7d2a9daaa176a17 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 12 Dec 2024 11:39:20 -0800
Subject: [PATCH 07/12] Implement basic automated tests targeting the
 `validate_json` function

---
 tests/test_util/README.md    |  5 ++
 tests/test_util/__init__.py  |  0
 tests/test_util/test_util.py | 99 ++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+)
 create mode 100644 tests/test_util/README.md
 create mode 100644 tests/test_util/__init__.py
 create mode 100644 tests/test_util/test_util.py

diff --git a/tests/test_util/README.md b/tests/test_util/README.md
new file mode 100644
index 00000000..c3746163
--- /dev/null
+++ b/tests/test_util/README.md
@@ -0,0 +1,5 @@
+This directory contains file related to testing code written in the `nmdc_runtime/util.py` file.
+
+I named the directory "`test_util`" in an attempt to follow the naming convention of the other test directories.
+In its name, "`test`" is a verb and "`util`" is a noun (i.e. "to test the utility"). This is in contrast to the file 
+`../test_util.py`, in whose name "`test`" is an adjective and "`util`" is a noun (i.e. "a test-related utility").
diff --git a/tests/test_util/__init__.py b/tests/test_util/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_util/test_util.py b/tests/test_util/test_util.py
new file mode 100644
index 00000000..4419bbe0
--- /dev/null
+++ b/tests/test_util/test_util.py
@@ -0,0 +1,99 @@
+from nmdc_runtime.api.db.mongo import get_mongo_db
+from nmdc_runtime.util import validate_json
+
+# Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo,
+#      by issuing the following command from the root directory of the repository:
+#      ```
+#      $ pytest tests/test_util/test_util.py
+#      ```
+
+
+def test_validate_json():
+    # Get a reference to the MongoDB database, since the `validate_json` function requires
+    # it to be passed in as a parameter.
+    mdb = get_mongo_db()
+
+    # Define a reusable dictionary that matches the value the `validate_json` function
+    # returns when it considers the input to be valid.
+    ok_result = {"result": "All Okay!"}
+
+    # Test: An empty outer dictionary is valid.
+    database_dict = {}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: An empty collection is valid.
+    database_dict = {"study_set": []}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: Two empty collections is valid.
+    database_dict = {"biosample_set": [], "study_set": []}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: A schema-compliant document is valid.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            }
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: Multiple schema-compliant documents are valid.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+            {
+                "id": "nmdc:sty-00-000002",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: The function reports an error for the schema-defiant document.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:OTHER-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result["result"] == "errors"
+    assert "study_set" in result["detail"]
+    assert len(result["detail"]["study_set"]) == 1
+
+    # Test: The function reports an error for each schema-defiant document.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:OTHER-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+            {
+                "id": "nmdc:OTHER-00-000002",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result["result"] == "errors"
+    assert "study_set" in result["detail"]
+    assert len(result["detail"]["study_set"]) == 2

From 7358d8b309e23bfa2eb0c86f51e5edf76e70a9b2 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 12 Dec 2024 11:42:14 -0800
Subject: [PATCH 08/12] Add test demonstrating function behavior with invalid
 collection name

---
 tests/test_util/test_util.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_util/test_util.py b/tests/test_util/test_util.py
index 4419bbe0..6653a1d2 100644
--- a/tests/test_util/test_util.py
+++ b/tests/test_util/test_util.py
@@ -27,6 +27,13 @@ def test_validate_json():
     result = validate_json(in_docs=database_dict, mdb=mdb)
     assert result == ok_result
 
+    # Test: The function reports an error for a schema-defiant collection name.
+    database_dict = {"OTHER_set": []}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result["result"] == "errors"
+    assert "OTHER_set" in result["detail"]
+    assert len(result["detail"]["OTHER_set"]) == 1
+
     # Test: Two empty collections is valid.
     database_dict = {"biosample_set": [], "study_set": []}
     result = validate_json(in_docs=database_dict, mdb=mdb)

From 52b0865d17673844c8b5fa5f52c230feebcd0880 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:12:07 -0800
Subject: [PATCH 09/12] Rename directory to work around naming conflict
 reported by pytest

---
 tests/test_the_util/README.md                 | 25 +++++++++++++++++++
 .../{test_util => test_the_util}/__init__.py  |  0
 .../test_the_util.py}                         |  2 +-
 tests/test_util/README.md                     |  5 ----
 4 files changed, 26 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_the_util/README.md
 rename tests/{test_util => test_the_util}/__init__.py (100%)
 rename tests/{test_util/test_util.py => test_the_util/test_the_util.py} (98%)
 delete mode 100644 tests/test_util/README.md

diff --git a/tests/test_the_util/README.md b/tests/test_the_util/README.md
new file mode 100644
index 00000000..2c8c24c5
--- /dev/null
+++ b/tests/test_the_util/README.md
@@ -0,0 +1,25 @@
+This directory contains files related to testing code written in the `nmdc_runtime/util.py` file.
+
+### Why this directory is not named `test_util`
+
+I named the directory "`test_the_util`" to work around a limitation of pytest in the context of this repository.
+
+I tried naming the directory "`test_util`" in an attempt to follow the naming convention of the other test directories.
+In its name, "`test`" was a verb and "`util`" was a noun (i.e. "to test the utility"). This was in contrast to the file 
+`../test_util.py` (a file that was already in the repository), in whose name "`test`" is an adjective and "`util`" is a
+noun (i.e. "a test-related utility"). However, with those names in place, `pytest` reported this error:
+
+```py
+_____________________ ERROR collecting tests/test_util.py ______________________
+import file mismatch:
+imported module 'tests.test_util' has this __file__ attribute:
+  /code/tests/test_util
+which is not the same as the test file we want to collect:
+  /code/tests/test_util.py
+HINT: remove __pycache__ / .pyc files and/or use a unique basename for your test file modules
+```
+
+To work around that, I renamed the directory to `test_the_util` and renamed the contained Python file to match.
+
+That is why the name of this test directory does not follow the naming convention
+of the other test directories.
diff --git a/tests/test_util/__init__.py b/tests/test_the_util/__init__.py
similarity index 100%
rename from tests/test_util/__init__.py
rename to tests/test_the_util/__init__.py
diff --git a/tests/test_util/test_util.py b/tests/test_the_util/test_the_util.py
similarity index 98%
rename from tests/test_util/test_util.py
rename to tests/test_the_util/test_the_util.py
index 6653a1d2..ba09d2dd 100644
--- a/tests/test_util/test_util.py
+++ b/tests/test_the_util/test_the_util.py
@@ -4,7 +4,7 @@
 # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo,
 #      by issuing the following command from the root directory of the repository:
 #      ```
-#      $ pytest tests/test_util/test_util.py
+#      $ pytest tests/test_the_util/test_the_util.py
 #      ```
 
 
diff --git a/tests/test_util/README.md b/tests/test_util/README.md
deleted file mode 100644
index c3746163..00000000
--- a/tests/test_util/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-This directory contains file related to testing code written in the `nmdc_runtime/util.py` file.
-
-I named the directory "`test_util`" in an attempt to follow the naming convention of the other test directories.
-In its name, "`test`" is a verb and "`util`" is a noun (i.e. "to test the utility"). This is in contrast to the file 
-`../test_util.py`, in whose name "`test`" is an adjective and "`util`" is a noun (i.e. "a test-related utility").

From a7e73c99947cf194cc46dd83abfc0f102900be75 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Tue, 17 Dec 2024 20:20:47 -0800
Subject: [PATCH 10/12] Check referential integrity after processing _all_
 specified collections

Previously, we were checking it after inserting documents into _each_
specified collection, which made it so we would not know whether a
referenced document would have been inserted into a later collection.
---
 nmdc_runtime/util.py | 199 +++++++++++++++++++++++--------------------
 1 file changed, 108 insertions(+), 91 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 76e22bbf..d242c31e 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -578,7 +578,7 @@ def merge_find(self, coll_name, find_spec: dict):
                 yield doc
 
 
-def validate_json(in_docs: dict, mdb: MongoDatabase):
+def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = True):
     r"""
     Checks whether the specified dictionary represents a valid instance of the `Database` class
     defined in the NMDC Schema.
@@ -594,6 +594,13 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
             {"id": "nmdc:sty-00-000002", ...}
         ]
     }
+
+    :param dict in_docs: The dictionary you want to validate
+    :param MongoDatabase mdb: A reference to a MongoDB database
+    :param bool check_references: Whether you want this function to check whether every document that is referenced
+                                  by any of the documents passed in would, indeed, exist in the database, if the
+                                  documents passed in were to be inserted into the database. In other words, set this
+                                  to `True` if you want this function to perform referential integrity checks.
     """
     validator = Draft7Validator(get_nmdc_jsonschema_dict())
     docs = deepcopy(in_docs)
@@ -623,96 +630,6 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                 try:
                     with OverlayDB(mdb) as odb:
                         odb.replace_or_insert_many(coll_name, coll_docs)
-
-                        # Check the referential integrity of the replaced or inserted documents.
-                        #
-                        # Note: If documents being inserted into the _current_ collection
-                        #       refer to documents being inserted into a _different_ collection
-                        #       as part of the same `in_docs` argument, this check will _not_
-                        #       find the latter documents.
-                        #
-                        # TODO: Enhance this referential integrity validation to account for the
-                        #       total of all operations; not just a single collection's operations.
-                        #
-                        # Note: Much of this code was copy/pasted from refscan, at:
-                        #       https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
-                        #
-                        source_collection_name = coll_name  # creates an alias to accommodate the copy/pasted code
-                        finder = Finder(
-                            database=odb
-                        )  # uses a generic name to accommodate the copy/pasted code
-                        references = (
-                            get_allowed_references()
-                        )  # uses a generic name to accommodate the copy/pasted code
-                        reference_field_names_by_source_class_name = (
-                            references.get_reference_field_names_by_source_class_name()
-                        )
-                        for document in coll_docs:
-                            # Get the document's schema class name so that we can interpret its fields accordingly.
-                            source_class_name = derive_schema_class_name_from_document(
-                                schema_view=nmdc_schema_view(),
-                                document=document,
-                            )
-
-                            # Get the names of that class's fields that can contain references.
-                            names_of_reference_fields = (
-                                reference_field_names_by_source_class_name.get(
-                                    source_class_name, []
-                                )
-                            )
-
-                            # Check each field that both (a) exists in the document and (b) can contain a reference.
-                            for field_name in names_of_reference_fields:
-                                if field_name in document:
-                                    # Determine which collections can contain the referenced document, based upon
-                                    # the schema class of which this source document is an instance.
-                                    target_collection_names = (
-                                        references.get_target_collection_names(
-                                            source_class_name=source_class_name,
-                                            source_field_name=field_name,
-                                        )
-                                    )
-
-                                    # Handle both the multi-value (array) and the single-value (scalar) case,
-                                    # normalizing the value or values into a list of values in either case.
-                                    if type(document[field_name]) is list:
-                                        target_ids = document[field_name]
-                                    else:
-                                        target_id = document[field_name]
-                                        target_ids = [
-                                            target_id
-                                        ]  # makes a one-item list
-
-                                    for target_id in target_ids:
-                                        name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections(
-                                            collection_names=target_collection_names,
-                                            document_id=target_id,
-                                        )
-                                        if (
-                                            name_of_collection_containing_target_document
-                                            is None
-                                        ):
-                                            violation = Violation(
-                                                source_collection_name=source_collection_name,
-                                                source_field_name=field_name,
-                                                source_document_object_id=document.get(
-                                                    "_id"
-                                                ),
-                                                source_document_id=document.get("id"),
-                                                target_id=target_id,
-                                                name_of_collection_containing_target=name_of_collection_containing_target_document,
-                                            )
-                                            violation_as_str = (
-                                                f"Document '{violation.source_document_id}' "
-                                                f"in collection '{violation.source_collection_name}' "
-                                                f"has a field '{violation.source_field_name}' that "
-                                                f"references a document having id "
-                                                f"'{violation.target_id}', but the latter document "
-                                                f"does not exist in any of the collections the "
-                                                f"NMDC Schema says it can exist in."
-                                            )
-                                            raise OverlayDBError(violation_as_str)
-
                 except OverlayDBError as e:
                     validation_errors[coll_name].append(str(e))
 
@@ -724,6 +641,106 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
         except Exception as e:
             return {"result": "errors", "detail": str(e)}
 
+        # Third pass (if enabled): Check inter-document references.
+        if check_references is True:
+            # Insert all documents specified for all collections specified, into the OverlayDB.
+            #
+            # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to,
+            #       instead, validate it after processing _each_ collection, we would get a false positive if a document
+            #       inserted into an earlier-processed collection happened to reference a document slated for insertion
+            #       into a later-processed collection. By waiting until all documents in all collections specified have
+            #       been inserted, we avoid that scenario.
+            #
+            with OverlayDB(mdb) as overlay_db:
+                print(f"Inserting documents into the OverlayDB.")
+                for collection_name, documents_to_insert in docs.items():
+                    try:
+                        overlay_db.replace_or_insert_many(collection_name, documents_to_insert)
+                    except OverlayDBError as error:
+                        validation_errors[collection_name].append(str(error))
+
+                # Now that the OverlayDB contains all the specified documents, we will check whether
+                # every document referenced by any of the inserted documents exists.
+                finder = Finder(database=overlay_db)
+                references = get_allowed_references()
+                reference_field_names_by_source_class_name = (
+                    references.get_reference_field_names_by_source_class_name()
+                )
+                for source_collection_name, documents_inserted in docs.items():
+                    # Check the referential integrity of the replaced or inserted documents.
+                    #
+                    # Note: Much of this code was copy/pasted from refscan, at:
+                    #       https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
+                    #
+                    print(f"Checking references emanating from documents inserted into '{source_collection_name}'.")
+                    for document in documents_inserted:
+                        # Get the document's schema class name so that we can interpret its fields accordingly.
+                        source_class_name = derive_schema_class_name_from_document(
+                            schema_view=nmdc_schema_view(),
+                            document=document,
+                        )
+
+                        # Get the names of that class's fields that can contain references.
+                        names_of_reference_fields = (
+                            reference_field_names_by_source_class_name.get(
+                                source_class_name, []
+                            )
+                        )
+
+                        # Check each field that both (a) exists in the document and (b) can contain a reference.
+                        for field_name in names_of_reference_fields:
+                            if field_name in document:
+                                # Determine which collections can contain the referenced document, based upon
+                                # the schema class of which this source document is an instance.
+                                target_collection_names = (
+                                    references.get_target_collection_names(
+                                        source_class_name=source_class_name,
+                                        source_field_name=field_name,
+                                    )
+                                )
+
+                                # Handle both the multi-value (array) and the single-value (scalar) case,
+                                # normalizing the value or values into a list of values in either case.
+                                if type(document[field_name]) is list:
+                                    target_ids = document[field_name]
+                                else:
+                                    target_id = document[field_name]
+                                    target_ids = [
+                                        target_id
+                                    ]  # makes a one-item list
+
+                                for target_id in target_ids:
+                                    name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections(
+                                        collection_names=target_collection_names,
+                                        document_id=target_id,
+                                    )
+                                    if (
+                                        name_of_collection_containing_target_document
+                                        is None
+                                    ):
+                                        violation = Violation(
+                                            source_collection_name=source_collection_name,
+                                            source_field_name=field_name,
+                                            source_document_object_id=document.get("_id"),
+                                            source_document_id=document.get("id"),
+                                            target_id=target_id,
+                                            name_of_collection_containing_target=name_of_collection_containing_target_document,
+                                        )
+                                        violation_as_str = (
+                                            f"Document '{violation.source_document_id}' "
+                                            f"in collection '{violation.source_collection_name}' "
+                                            f"has a field '{violation.source_field_name}' that "
+                                            f"references a document having id "
+                                            f"'{violation.target_id}', but the latter document "
+                                            f"does not exist in any of the collections the "
+                                            f"NMDC Schema says it can exist in."
+                                        )
+                                        validation_errors[source_collection_name].append(violation_as_str)
+
+            # If any collection's error list is not empty, return an error response.
+            if any(len(v) > 0 for v in validation_errors.values()):
+                return {"result": "errors", "detail": validation_errors}
+
         return {"result": "All Okay!"}
     else:
         return {"result": "errors", "detail": validation_errors}

From 3389a4dab1a044e3272cc70f556a5aea93c97ee7 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Wed, 18 Dec 2024 04:21:15 +0000
Subject: [PATCH 11/12] style: reformat

---
 nmdc_runtime/util.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index d242c31e..81aa8bcd 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -655,7 +655,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr
                 print(f"Inserting documents into the OverlayDB.")
                 for collection_name, documents_to_insert in docs.items():
                     try:
-                        overlay_db.replace_or_insert_many(collection_name, documents_to_insert)
+                        overlay_db.replace_or_insert_many(
+                            collection_name, documents_to_insert
+                        )
                     except OverlayDBError as error:
                         validation_errors[collection_name].append(str(error))
 
@@ -672,7 +674,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr
                     # Note: Much of this code was copy/pasted from refscan, at:
                     #       https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
                     #
-                    print(f"Checking references emanating from documents inserted into '{source_collection_name}'.")
+                    print(
+                        f"Checking references emanating from documents inserted into '{source_collection_name}'."
+                    )
                     for document in documents_inserted:
                         # Get the document's schema class name so that we can interpret its fields accordingly.
                         source_class_name = derive_schema_class_name_from_document(
@@ -705,9 +709,7 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr
                                     target_ids = document[field_name]
                                 else:
                                     target_id = document[field_name]
-                                    target_ids = [
-                                        target_id
-                                    ]  # makes a one-item list
+                                    target_ids = [target_id]  # makes a one-item list
 
                                 for target_id in target_ids:
                                     name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections(
@@ -721,7 +723,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr
                                         violation = Violation(
                                             source_collection_name=source_collection_name,
                                             source_field_name=field_name,
-                                            source_document_object_id=document.get("_id"),
+                                            source_document_object_id=document.get(
+                                                "_id"
+                                            ),
                                             source_document_id=document.get("id"),
                                             target_id=target_id,
                                             name_of_collection_containing_target=name_of_collection_containing_target_document,
@@ -735,7 +739,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr
                                             f"does not exist in any of the collections the "
                                             f"NMDC Schema says it can exist in."
                                         )
-                                        validation_errors[source_collection_name].append(violation_as_str)
+                                        validation_errors[
+                                            source_collection_name
+                                        ].append(violation_as_str)
 
             # If any collection's error list is not empty, return an error response.
             if any(len(v) > 0 for v in validation_errors.values()):

From bfd880f96829ea27db19a23e9dcb328d7308aebf Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Tue, 17 Dec 2024 20:21:31 -0800
Subject: [PATCH 12/12] Disable referential integrity checking in
 `validate_json` by default

---
 nmdc_runtime/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 81aa8bcd..780ec865 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -578,7 +578,7 @@ def merge_find(self, coll_name, find_spec: dict):
                 yield doc
 
 
-def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = True):
+def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = False):
     r"""
     Checks whether the specified dictionary represents a valid instance of the `Database` class
     defined in the NMDC Schema.