From 0d4fe3bf3c2cc1ea8791cafdb5e329d9eed32938 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 11 Dec 2024 21:51:40 -0800 Subject: [PATCH 01/12] Add the `refscan` PyPI package (v0.1.22) as a dependency of the Runtime --- requirements/main.in | 3 +++ requirements/main.txt | 19 ++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/requirements/main.in b/requirements/main.in index 2af8c6e7..9d0e1481 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -37,6 +37,9 @@ python-jose[cryptography] # Reference: https://github.com/microbiomedata/nmdc-runtime/security/dependabot/8 python-multipart>=0.0.18 pyyaml +# Note: We use `refscan` to get information about inter-document references from the schema and database. +# Reference: https://pypi.org/project/refscan/ +refscan==0.1.22 requests semver setuptools-scm diff --git a/requirements/main.txt b/requirements/main.txt index 882d9435..5f683fa5 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -21,8 +21,6 @@ anyio==4.7.0 # jupyter-server # starlette # watchfiles -appnope==0.1.4 - # via ipykernel argon2-cffi==23.1.0 # via jupyter-server argon2-cffi-bindings==21.2.0 @@ -94,6 +92,7 @@ click==8.1.7 # linkml-runtime # mkdocs # prefixcommons + # typer # uvicorn colorama==0.4.6 # via mkdocs-material @@ -196,6 +195,8 @@ graphql-relay==3.2.0 # via graphene graphviz==0.20.3 # via linkml +greenlet==3.1.1 + # via sqlalchemy grpcio==1.68.1 # via # dagster @@ -361,6 +362,7 @@ linkml-runtime==1.8.3 # linkml # linkml-dataops # nmdc-schema + # refscan lxml==5.3.0 # via -r requirements/main.in mako==1.3.7 @@ -563,6 +565,7 @@ pymongo==4.9.2 # -r requirements/main.in # motor # nmdc-schema + # refscan pyparsing==3.2.0 # via rdflib pyshex==0.8.1 @@ -649,6 +652,8 @@ referencing==0.35.1 # jsonschema # jsonschema-specifications # jupyter-events +refscan==0.1.22 + # via -r requirements/main.in regex==2024.11.6 # via mkdocs-material requests==2.32.3 @@ -681,7 +686,10 @@ rfc3986-validator==0.1.1 rfc3987==1.3.8 # via jsonschema rich==13.9.4 - # via dagster + # via + # dagster + # refscan + # typer rpds-py==0.22.3 # via # jsonschema @@ -702,6 +710,8 @@ send2trash==1.8.3 # via jupyter-server setuptools-scm==8.1.0 # via -r requirements/main.in +shellingham==1.5.4 + # via typer shexjsg==0.8.2 # via # pyshex @@ -792,6 +802,8 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat +typer==0.12.5 + # via refscan types-python-dateutil==2.9.0.20241206 # via arrow typing-extensions==4.12.2 @@ -810,6 +822,7 @@ typing-extensions==4.12.2 # pydantic-core # rich # sqlalchemy + # typer # uvicorn tzdata==2024.2 # via pandas From a8422aa87a54a0235a0ce6b51ad73c499a5fea0f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 11 Dec 2024 23:12:53 -0800 Subject: [PATCH 02/12] Generate and cache a list of schema-allowed references upon app startup --- nmdc_runtime/api/main.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 46518485..25516d74 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -1,6 +1,7 @@ import os import re from contextlib import asynccontextmanager +from functools import cache from importlib import import_module from importlib.metadata import version from typing import Annotated @@ -13,12 +14,16 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.openapi.docs import get_swagger_ui_html from fastapi.staticfiles import StaticFiles +from linkml_runtime.utils.schemaview import SchemaView +from nmdc_schema.nmdc_data import get_nmdc_schema_definition +from refscan.lib.helpers import identify_references, ReferenceList from setuptools_scm import get_version from starlette import status from starlette.responses import RedirectResponse, HTMLResponse, FileResponse from nmdc_runtime.api.analytics import Analytics from nmdc_runtime.util import ( + collection_name_to_class_names, ensure_unique_id_indexes, REPO_ROOT_DIR, ) @@ -354,11 +359,42 @@ def ensure_default_api_perms(): db["_runtime.api.allow"].create_index("action") +@cache # memoizes the decorated function +def get_allowed_references() -> ReferenceList: + r""" + Returns a `ReferenceList` of all the inter-document references that + the NMDC Schema allows a schema-compliant MongoDB database to contain. + """ + + # Instantiate a LinkML `SchemaView` bound to the NMDC Schema. + schema_view = SchemaView(get_nmdc_schema_definition()) + + # Identify the inter-document references that the schema allows a database to contain. + print("Identifying schema-allowed references.") + references = identify_references( + schema_view=schema_view, + collection_name_to_class_names=collection_name_to_class_names + ) + + return references + + @asynccontextmanager async def lifespan(app: FastAPI): + r""" + Prepares the application to receive requests. + + From the [FastAPI documentation](https://fastapi.tiangolo.com/advanced/events/#lifespan-function): + > You can define logic (code) that should be executed before the application starts up. This means that + > this code will be executed once, before the application starts receiving requests. + + Note: Based on my own observations, I think this function gets called when the first request starts coming in, + but not before that (i.e. not when the application is idle before any requests start coming in). + """ ensure_initial_resources_on_boot() ensure_attribute_indexes() ensure_default_api_perms() + _ = get_allowed_references() # note: future invocations will benefit from the function's memoized-ness yield From 9bc8d1faee2409690fb423c0ce240fc5051aa3a4 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 11 Dec 2024 23:32:20 -0800 Subject: [PATCH 03/12] Get `SchemaView` via existing `util` function instead of independently --- nmdc_runtime/api/main.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 25516d74..fa0beff7 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -14,8 +14,6 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.openapi.docs import get_swagger_ui_html from fastapi.staticfiles import StaticFiles -from linkml_runtime.utils.schemaview import SchemaView -from nmdc_schema.nmdc_data import get_nmdc_schema_definition from refscan.lib.helpers import identify_references, ReferenceList from setuptools_scm import get_version from starlette import status @@ -25,6 +23,7 @@ from nmdc_runtime.util import ( collection_name_to_class_names, ensure_unique_id_indexes, + nmdc_schema_view, REPO_ROOT_DIR, ) from nmdc_runtime.api.core.auth import ( @@ -366,13 +365,10 @@ def get_allowed_references() -> ReferenceList: the NMDC Schema allows a schema-compliant MongoDB database to contain. """ - # Instantiate a LinkML `SchemaView` bound to the NMDC Schema. - schema_view = SchemaView(get_nmdc_schema_definition()) - # Identify the inter-document references that the schema allows a database to contain. print("Identifying schema-allowed references.") references = identify_references( - schema_view=schema_view, + schema_view=nmdc_schema_view(), collection_name_to_class_names=collection_name_to_class_names ) From 7bf29363fca16e10fe8ab11f198d8cfd52d0453f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 12 Dec 2024 00:47:27 -0800 Subject: [PATCH 04/12] Check referential integrity on `/metadata/json:validate` in real time --- nmdc_runtime/api/main.py | 21 +------ nmdc_runtime/util.py | 120 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 20 deletions(-) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index fa0beff7..c443ce9c 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -14,16 +14,14 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.openapi.docs import get_swagger_ui_html from fastapi.staticfiles import StaticFiles -from refscan.lib.helpers import identify_references, ReferenceList from setuptools_scm import get_version from starlette import status from starlette.responses import RedirectResponse, HTMLResponse, FileResponse from nmdc_runtime.api.analytics import Analytics from nmdc_runtime.util import ( - collection_name_to_class_names, + get_allowed_references, ensure_unique_id_indexes, - nmdc_schema_view, REPO_ROOT_DIR, ) from nmdc_runtime.api.core.auth import ( @@ -358,23 +356,6 @@ def ensure_default_api_perms(): db["_runtime.api.allow"].create_index("action") -@cache # memoizes the decorated function -def get_allowed_references() -> ReferenceList: - r""" - Returns a `ReferenceList` of all the inter-document references that - the NMDC Schema allows a schema-compliant MongoDB database to contain. - """ - - # Identify the inter-document references that the schema allows a database to contain. - print("Identifying schema-allowed references.") - references = identify_references( - schema_view=nmdc_schema_view(), - collection_name_to_class_names=collection_name_to_class_names - ) - - return references - - @asynccontextmanager async def lifespan(app: FastAPI): r""" diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index eee078f9..05a1007a 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -22,6 +22,13 @@ from pydantic import Field, BaseModel from pymongo.database import Database as MongoDatabase from pymongo.errors import OperationFailure +from refscan.lib.helpers import ( + derive_schema_class_name_from_document, + identify_references, +) +from refscan.lib.Finder import Finder +from refscan.lib.ReferenceList import ReferenceList +from refscan.lib.Violation import Violation from toolz import merge, unique from nmdc_runtime.api.core.util import sha256hash_from_file @@ -76,6 +83,23 @@ def get_class_names_from_collection_spec( return class_names +@lru_cache +def get_allowed_references() -> ReferenceList: + r""" + Returns a `ReferenceList` of all the inter-document references that + the NMDC Schema allows a schema-compliant MongoDB database to contain. + """ + + # Identify the inter-document references that the schema allows a database to contain. + print("Identifying schema-allowed references.") + references = identify_references( + schema_view=nmdc_schema_view(), + collection_name_to_class_names=collection_name_to_class_names + ) + + return references + + @lru_cache def get_type_collections() -> dict: """Returns a dictionary mapping class names to Mongo collection names.""" @@ -497,6 +521,13 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self._bottom_db.client.drop_database(self._top_db.name) + def get_collection(self, coll_name: str): + r"""Returns a reference to the specified collection.""" + try: + return self._top_db[coll_name] + except OperationFailure as e: + raise OverlayDBError(str(e.details)) + def replace_or_insert_many(self, coll_name, documents: list): try: self._top_db[coll_name].insert_many(documents) @@ -548,6 +579,22 @@ def merge_find(self, coll_name, find_spec: dict): def validate_json(in_docs: dict, mdb: MongoDatabase): + r""" + Checks whether the specified dictionary represents a valid instance of the `Database` class + defined in the NMDC Schema. + + Example dictionary: + { + "biosample_set": [ + {"id": "nmdc:bsm-00-000001", ...}, + {"id": "nmdc:bsm-00-000002", ...} + ], + "study_set": [ + {"id": "nmdc:sty-00-000001", ...}, + {"id": "nmdc:sty-00-000002", ...} + ] + } + """ validator = Draft7Validator(get_nmdc_jsonschema_dict()) docs = deepcopy(in_docs) validation_errors = {} @@ -576,6 +623,79 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): try: with OverlayDB(mdb) as odb: odb.replace_or_insert_many(coll_name, coll_docs) + + # Check the referential integrity of the replaced or inserted documents. + # + # Note: If documents being inserted into the _current_ collection + # refer to documents being inserted into a _different_ collection + # as part of the same `in_docs` argument, this check will _not_ + # find the latter documents. + # + # TODO: Enhance this referential integrity validation to account for the + # total of all operations; not just a single collection's operations. + # + # Note: Much of this code was copy/pasted from refscan, at: + # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 + # + source_collection_name = coll_name # creates an alias to accommodate the copy/pasted code + finder = Finder(database=odb) # uses a generic name to accommodate the copy/pasted code + references = get_allowed_references() # uses a generic name to accommodate the copy/pasted code + reference_field_names_by_source_class_name = references.get_reference_field_names_by_source_class_name() + for document in coll_docs: + + # Get the document's schema class name so that we can interpret its fields accordingly. + source_class_name = derive_schema_class_name_from_document( + schema_view=nmdc_schema_view(), + document=document, + ) + + # Get the names of that class's fields that can contain references. + # Get the names of that class's fields that can contain references. + names_of_reference_fields = reference_field_names_by_source_class_name.get(source_class_name, []) + + # Check each field that both (a) exists in the document and (b) can contain a reference. + for field_name in names_of_reference_fields: + if field_name in document: + + # Determine which collections can contain the referenced document, based upon + # the schema class of which this source document is an instance. + target_collection_names = references.get_target_collection_names( + source_class_name=source_class_name, + source_field_name=field_name, + ) + + # Handle both the multi-value (array) and the single-value (scalar) case, + # normalizing the value or values into a list of values in either case. + if type(document[field_name]) is list: + target_ids = document[field_name] + else: + target_id = document[field_name] + target_ids = [target_id] # makes a one-item list + + for target_id in target_ids: + name_of_collection_containing_target_document = ( + finder.check_whether_document_having_id_exists_among_collections( + collection_names=target_collection_names, document_id=target_id + ) + ) + if name_of_collection_containing_target_document is None: + violation = Violation( + source_collection_name=source_collection_name, + source_field_name=field_name, + source_document_object_id=document.get("_id"), + source_document_id=document.get("id"), + target_id=target_id, + name_of_collection_containing_target=None, + ) + violation_as_str = (f"Document '{violation.source_document_id}' " + f"in collection '{violation.source_collection_name}' " + f"has a field '{violation.source_field_name}' that " + f"references a document having id " + f"'{violation.target_id}', but the latter document " + f"does not exist in any of the collections the " + f"NMDC Schema says it can exist in.") + raise OverlayDBError(violation_as_str) + except OverlayDBError as e: validation_errors[coll_name].append(str(e)) From d8f69d1db9bdca7df6a87ef06e07cbd45c725acf Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 12 Dec 2024 08:49:09 +0000 Subject: [PATCH 05/12] style: reformat --- nmdc_runtime/api/main.py | 4 ++- nmdc_runtime/util.py | 64 ++++++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index c443ce9c..6498d4cb 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -371,7 +371,9 @@ async def lifespan(app: FastAPI): ensure_initial_resources_on_boot() ensure_attribute_indexes() ensure_default_api_perms() - _ = get_allowed_references() # note: future invocations will benefit from the function's memoized-ness + _ = ( + get_allowed_references() + ) # note: future invocations will benefit from the function's memoized-ness yield diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 05a1007a..8e72689d 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -94,7 +94,7 @@ def get_allowed_references() -> ReferenceList: print("Identifying schema-allowed references.") references = identify_references( schema_view=nmdc_schema_view(), - collection_name_to_class_names=collection_name_to_class_names + collection_name_to_class_names=collection_name_to_class_names, ) return references @@ -638,9 +638,15 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 # source_collection_name = coll_name # creates an alias to accommodate the copy/pasted code - finder = Finder(database=odb) # uses a generic name to accommodate the copy/pasted code - references = get_allowed_references() # uses a generic name to accommodate the copy/pasted code - reference_field_names_by_source_class_name = references.get_reference_field_names_by_source_class_name() + finder = Finder( + database=odb + ) # uses a generic name to accommodate the copy/pasted code + references = ( + get_allowed_references() + ) # uses a generic name to accommodate the copy/pasted code + reference_field_names_by_source_class_name = ( + references.get_reference_field_names_by_source_class_name() + ) for document in coll_docs: # Get the document's schema class name so that we can interpret its fields accordingly. @@ -651,7 +657,11 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): # Get the names of that class's fields that can contain references. # Get the names of that class's fields that can contain references. - names_of_reference_fields = reference_field_names_by_source_class_name.get(source_class_name, []) + names_of_reference_fields = ( + reference_field_names_by_source_class_name.get( + source_class_name, [] + ) + ) # Check each field that both (a) exists in the document and (b) can contain a reference. for field_name in names_of_reference_fields: @@ -659,9 +669,11 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): # Determine which collections can contain the referenced document, based upon # the schema class of which this source document is an instance. - target_collection_names = references.get_target_collection_names( - source_class_name=source_class_name, - source_field_name=field_name, + target_collection_names = ( + references.get_target_collection_names( + source_class_name=source_class_name, + source_field_name=field_name, + ) ) # Handle both the multi-value (array) and the single-value (scalar) case, @@ -670,30 +682,38 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): target_ids = document[field_name] else: target_id = document[field_name] - target_ids = [target_id] # makes a one-item list + target_ids = [ + target_id + ] # makes a one-item list for target_id in target_ids: - name_of_collection_containing_target_document = ( - finder.check_whether_document_having_id_exists_among_collections( - collection_names=target_collection_names, document_id=target_id - ) + name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( + collection_names=target_collection_names, + document_id=target_id, ) - if name_of_collection_containing_target_document is None: + if ( + name_of_collection_containing_target_document + is None + ): violation = Violation( source_collection_name=source_collection_name, source_field_name=field_name, - source_document_object_id=document.get("_id"), + source_document_object_id=document.get( + "_id" + ), source_document_id=document.get("id"), target_id=target_id, name_of_collection_containing_target=None, ) - violation_as_str = (f"Document '{violation.source_document_id}' " - f"in collection '{violation.source_collection_name}' " - f"has a field '{violation.source_field_name}' that " - f"references a document having id " - f"'{violation.target_id}', but the latter document " - f"does not exist in any of the collections the " - f"NMDC Schema says it can exist in.") + violation_as_str = ( + f"Document '{violation.source_document_id}' " + f"in collection '{violation.source_collection_name}' " + f"has a field '{violation.source_field_name}' that " + f"references a document having id " + f"'{violation.target_id}', but the latter document " + f"does not exist in any of the collections the " + f"NMDC Schema says it can exist in." + ) raise OverlayDBError(violation_as_str) except OverlayDBError as e: From 82ce38e0cb43feb5548fe2cc74fa9e0c1ee4ee70 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 12 Dec 2024 10:56:44 -0800 Subject: [PATCH 06/12] Remove redundant comment, replicate `refscan` code, and run `black` --- nmdc_runtime/util.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 8e72689d..76e22bbf 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -648,14 +648,12 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): references.get_reference_field_names_by_source_class_name() ) for document in coll_docs: - # Get the document's schema class name so that we can interpret its fields accordingly. source_class_name = derive_schema_class_name_from_document( schema_view=nmdc_schema_view(), document=document, ) - # Get the names of that class's fields that can contain references. # Get the names of that class's fields that can contain references. names_of_reference_fields = ( reference_field_names_by_source_class_name.get( @@ -666,7 +664,6 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): # Check each field that both (a) exists in the document and (b) can contain a reference. for field_name in names_of_reference_fields: if field_name in document: - # Determine which collections can contain the referenced document, based upon # the schema class of which this source document is an instance. target_collection_names = ( @@ -703,7 +700,7 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): ), source_document_id=document.get("id"), target_id=target_id, - name_of_collection_containing_target=None, + name_of_collection_containing_target=name_of_collection_containing_target_document, ) violation_as_str = ( f"Document '{violation.source_document_id}' " From ae83dc9e7162756dea4c572ac7d2a9daaa176a17 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 12 Dec 2024 11:39:20 -0800 Subject: [PATCH 07/12] Implement basic automated tests targeting the `validate_json` function --- tests/test_util/README.md | 5 ++ tests/test_util/__init__.py | 0 tests/test_util/test_util.py | 99 ++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 tests/test_util/README.md create mode 100644 tests/test_util/__init__.py create mode 100644 tests/test_util/test_util.py diff --git a/tests/test_util/README.md b/tests/test_util/README.md new file mode 100644 index 00000000..c3746163 --- /dev/null +++ b/tests/test_util/README.md @@ -0,0 +1,5 @@ +This directory contains file related to testing code written in the `nmdc_runtime/util.py` file. + +I named the directory "`test_util`" in an attempt to follow the naming convention of the other test directories. +In its name, "`test`" is a verb and "`util`" is a noun (i.e. "to test the utility"). This is in contrast to the file +`../test_util.py`, in whose name "`test`" is an adjective and "`util`" is a noun (i.e. "a test-related utility"). diff --git a/tests/test_util/__init__.py b/tests/test_util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_util/test_util.py b/tests/test_util/test_util.py new file mode 100644 index 00000000..4419bbe0 --- /dev/null +++ b/tests/test_util/test_util.py @@ -0,0 +1,99 @@ +from nmdc_runtime.api.db.mongo import get_mongo_db +from nmdc_runtime.util import validate_json + +# Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo, +# by issuing the following command from the root directory of the repository: +# ``` +# $ pytest tests/test_util/test_util.py +# ``` + + +def test_validate_json(): + # Get a reference to the MongoDB database, since the `validate_json` function requires + # it to be passed in as a parameter. + mdb = get_mongo_db() + + # Define a reusable dictionary that matches the value the `validate_json` function + # returns when it considers the input to be valid. + ok_result = {"result": "All Okay!"} + + # Test: An empty outer dictionary is valid. + database_dict = {} + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result == ok_result + + # Test: An empty collection is valid. + database_dict = {"study_set": []} + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result == ok_result + + # Test: Two empty collections is valid. + database_dict = {"biosample_set": [], "study_set": []} + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result == ok_result + + # Test: A schema-compliant document is valid. + database_dict = { + "study_set": [ + { + "id": "nmdc:sty-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + } + ] + } + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result == ok_result + + # Test: Multiple schema-compliant documents are valid. + database_dict = { + "study_set": [ + { + "id": "nmdc:sty-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + }, + { + "id": "nmdc:sty-00-000002", + "type": "nmdc:Study", + "study_category": "research_study", + }, + ] + } + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result == ok_result + + # Test: The function reports an error for the schema-defiant document. + database_dict = { + "study_set": [ + { + "id": "nmdc:OTHER-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + }, + ] + } + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result["result"] == "errors" + assert "study_set" in result["detail"] + assert len(result["detail"]["study_set"]) == 1 + + # Test: The function reports an error for each schema-defiant document. + database_dict = { + "study_set": [ + { + "id": "nmdc:OTHER-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + }, + { + "id": "nmdc:OTHER-00-000002", + "type": "nmdc:Study", + "study_category": "research_study", + }, + ] + } + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result["result"] == "errors" + assert "study_set" in result["detail"] + assert len(result["detail"]["study_set"]) == 2 From 7358d8b309e23bfa2eb0c86f51e5edf76e70a9b2 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 12 Dec 2024 11:42:14 -0800 Subject: [PATCH 08/12] Add test demonstrating function behavior with invalid collection name --- tests/test_util/test_util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_util/test_util.py b/tests/test_util/test_util.py index 4419bbe0..6653a1d2 100644 --- a/tests/test_util/test_util.py +++ b/tests/test_util/test_util.py @@ -27,6 +27,13 @@ def test_validate_json(): result = validate_json(in_docs=database_dict, mdb=mdb) assert result == ok_result + # Test: The function reports an error for a schema-defiant collection name. + database_dict = {"OTHER_set": []} + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result["result"] == "errors" + assert "OTHER_set" in result["detail"] + assert len(result["detail"]["OTHER_set"]) == 1 + # Test: Two empty collections is valid. database_dict = {"biosample_set": [], "study_set": []} result = validate_json(in_docs=database_dict, mdb=mdb) From 52b0865d17673844c8b5fa5f52c230feebcd0880 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 12 Dec 2024 12:12:07 -0800 Subject: [PATCH 09/12] Rename directory to work around naming conflict reported by pytest --- tests/test_the_util/README.md | 25 +++++++++++++++++++ .../{test_util => test_the_util}/__init__.py | 0 .../test_the_util.py} | 2 +- tests/test_util/README.md | 5 ---- 4 files changed, 26 insertions(+), 6 deletions(-) create mode 100644 tests/test_the_util/README.md rename tests/{test_util => test_the_util}/__init__.py (100%) rename tests/{test_util/test_util.py => test_the_util/test_the_util.py} (98%) delete mode 100644 tests/test_util/README.md diff --git a/tests/test_the_util/README.md b/tests/test_the_util/README.md new file mode 100644 index 00000000..2c8c24c5 --- /dev/null +++ b/tests/test_the_util/README.md @@ -0,0 +1,25 @@ +This directory contains files related to testing code written in the `nmdc_runtime/util.py` file. + +### Why this directory is not named `test_util` + +I named the directory "`test_the_util`" to work around a limitation of pytest in the context of this repository. + +I tried naming the directory "`test_util`" in an attempt to follow the naming convention of the other test directories. +In its name, "`test`" was a verb and "`util`" was a noun (i.e. "to test the utility"). This was in contrast to the file +`../test_util.py` (a file that was already in the repository), in whose name "`test`" is an adjective and "`util`" is a +noun (i.e. "a test-related utility"). However, with those names in place, `pytest` reported this error: + +```py +_____________________ ERROR collecting tests/test_util.py ______________________ +import file mismatch: +imported module 'tests.test_util' has this __file__ attribute: + /code/tests/test_util +which is not the same as the test file we want to collect: + /code/tests/test_util.py +HINT: remove __pycache__ / .pyc files and/or use a unique basename for your test file modules +``` + +To work around that, I renamed the directory to `test_the_util` and renamed the contained Python file to match. + +That is why the name of this test directory does not follow the naming convention +of the other test directories. diff --git a/tests/test_util/__init__.py b/tests/test_the_util/__init__.py similarity index 100% rename from tests/test_util/__init__.py rename to tests/test_the_util/__init__.py diff --git a/tests/test_util/test_util.py b/tests/test_the_util/test_the_util.py similarity index 98% rename from tests/test_util/test_util.py rename to tests/test_the_util/test_the_util.py index 6653a1d2..ba09d2dd 100644 --- a/tests/test_util/test_util.py +++ b/tests/test_the_util/test_the_util.py @@ -4,7 +4,7 @@ # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo, # by issuing the following command from the root directory of the repository: # ``` -# $ pytest tests/test_util/test_util.py +# $ pytest tests/test_the_util/test_the_util.py # ``` diff --git a/tests/test_util/README.md b/tests/test_util/README.md deleted file mode 100644 index c3746163..00000000 --- a/tests/test_util/README.md +++ /dev/null @@ -1,5 +0,0 @@ -This directory contains file related to testing code written in the `nmdc_runtime/util.py` file. - -I named the directory "`test_util`" in an attempt to follow the naming convention of the other test directories. -In its name, "`test`" is a verb and "`util`" is a noun (i.e. "to test the utility"). This is in contrast to the file -`../test_util.py`, in whose name "`test`" is an adjective and "`util`" is a noun (i.e. "a test-related utility"). From a7e73c99947cf194cc46dd83abfc0f102900be75 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 20:20:47 -0800 Subject: [PATCH 10/12] Check referential integrity after processing _all_ specified collections Previously, we were checking it after inserting documents into _each_ specified collection, which made it so we would not know whether a referenced document would have been inserted into a later collection. --- nmdc_runtime/util.py | 199 +++++++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 91 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 76e22bbf..d242c31e 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -578,7 +578,7 @@ def merge_find(self, coll_name, find_spec: dict): yield doc -def validate_json(in_docs: dict, mdb: MongoDatabase): +def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = True): r""" Checks whether the specified dictionary represents a valid instance of the `Database` class defined in the NMDC Schema. @@ -594,6 +594,13 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): {"id": "nmdc:sty-00-000002", ...} ] } + + :param dict in_docs: The dictionary you want to validate + :param MongoDatabase mdb: A reference to a MongoDB database + :param bool check_references: Whether you want this function to check whether every document that is referenced + by any of the documents passed in would, indeed, exist in the database, if the + documents passed in were to be inserted into the database. In other words, set this + to `True` if you want this function to perform referential integrity checks. """ validator = Draft7Validator(get_nmdc_jsonschema_dict()) docs = deepcopy(in_docs) @@ -623,96 +630,6 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): try: with OverlayDB(mdb) as odb: odb.replace_or_insert_many(coll_name, coll_docs) - - # Check the referential integrity of the replaced or inserted documents. - # - # Note: If documents being inserted into the _current_ collection - # refer to documents being inserted into a _different_ collection - # as part of the same `in_docs` argument, this check will _not_ - # find the latter documents. - # - # TODO: Enhance this referential integrity validation to account for the - # total of all operations; not just a single collection's operations. - # - # Note: Much of this code was copy/pasted from refscan, at: - # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 - # - source_collection_name = coll_name # creates an alias to accommodate the copy/pasted code - finder = Finder( - database=odb - ) # uses a generic name to accommodate the copy/pasted code - references = ( - get_allowed_references() - ) # uses a generic name to accommodate the copy/pasted code - reference_field_names_by_source_class_name = ( - references.get_reference_field_names_by_source_class_name() - ) - for document in coll_docs: - # Get the document's schema class name so that we can interpret its fields accordingly. - source_class_name = derive_schema_class_name_from_document( - schema_view=nmdc_schema_view(), - document=document, - ) - - # Get the names of that class's fields that can contain references. - names_of_reference_fields = ( - reference_field_names_by_source_class_name.get( - source_class_name, [] - ) - ) - - # Check each field that both (a) exists in the document and (b) can contain a reference. - for field_name in names_of_reference_fields: - if field_name in document: - # Determine which collections can contain the referenced document, based upon - # the schema class of which this source document is an instance. - target_collection_names = ( - references.get_target_collection_names( - source_class_name=source_class_name, - source_field_name=field_name, - ) - ) - - # Handle both the multi-value (array) and the single-value (scalar) case, - # normalizing the value or values into a list of values in either case. - if type(document[field_name]) is list: - target_ids = document[field_name] - else: - target_id = document[field_name] - target_ids = [ - target_id - ] # makes a one-item list - - for target_id in target_ids: - name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( - collection_names=target_collection_names, - document_id=target_id, - ) - if ( - name_of_collection_containing_target_document - is None - ): - violation = Violation( - source_collection_name=source_collection_name, - source_field_name=field_name, - source_document_object_id=document.get( - "_id" - ), - source_document_id=document.get("id"), - target_id=target_id, - name_of_collection_containing_target=name_of_collection_containing_target_document, - ) - violation_as_str = ( - f"Document '{violation.source_document_id}' " - f"in collection '{violation.source_collection_name}' " - f"has a field '{violation.source_field_name}' that " - f"references a document having id " - f"'{violation.target_id}', but the latter document " - f"does not exist in any of the collections the " - f"NMDC Schema says it can exist in." - ) - raise OverlayDBError(violation_as_str) - except OverlayDBError as e: validation_errors[coll_name].append(str(e)) @@ -724,6 +641,106 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): except Exception as e: return {"result": "errors", "detail": str(e)} + # Third pass (if enabled): Check inter-document references. + if check_references is True: + # Insert all documents specified for all collections specified, into the OverlayDB. + # + # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to, + # instead, validate it after processing _each_ collection, we would get a false positive if a document + # inserted into an earlier-processed collection happened to reference a document slated for insertion + # into a later-processed collection. By waiting until all documents in all collections specified have + # been inserted, we avoid that scenario. + # + with OverlayDB(mdb) as overlay_db: + print(f"Inserting documents into the OverlayDB.") + for collection_name, documents_to_insert in docs.items(): + try: + overlay_db.replace_or_insert_many(collection_name, documents_to_insert) + except OverlayDBError as error: + validation_errors[collection_name].append(str(error)) + + # Now that the OverlayDB contains all the specified documents, we will check whether + # every document referenced by any of the inserted documents exists. + finder = Finder(database=overlay_db) + references = get_allowed_references() + reference_field_names_by_source_class_name = ( + references.get_reference_field_names_by_source_class_name() + ) + for source_collection_name, documents_inserted in docs.items(): + # Check the referential integrity of the replaced or inserted documents. + # + # Note: Much of this code was copy/pasted from refscan, at: + # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 + # + print(f"Checking references emanating from documents inserted into '{source_collection_name}'.") + for document in documents_inserted: + # Get the document's schema class name so that we can interpret its fields accordingly. + source_class_name = derive_schema_class_name_from_document( + schema_view=nmdc_schema_view(), + document=document, + ) + + # Get the names of that class's fields that can contain references. + names_of_reference_fields = ( + reference_field_names_by_source_class_name.get( + source_class_name, [] + ) + ) + + # Check each field that both (a) exists in the document and (b) can contain a reference. + for field_name in names_of_reference_fields: + if field_name in document: + # Determine which collections can contain the referenced document, based upon + # the schema class of which this source document is an instance. + target_collection_names = ( + references.get_target_collection_names( + source_class_name=source_class_name, + source_field_name=field_name, + ) + ) + + # Handle both the multi-value (array) and the single-value (scalar) case, + # normalizing the value or values into a list of values in either case. + if type(document[field_name]) is list: + target_ids = document[field_name] + else: + target_id = document[field_name] + target_ids = [ + target_id + ] # makes a one-item list + + for target_id in target_ids: + name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( + collection_names=target_collection_names, + document_id=target_id, + ) + if ( + name_of_collection_containing_target_document + is None + ): + violation = Violation( + source_collection_name=source_collection_name, + source_field_name=field_name, + source_document_object_id=document.get("_id"), + source_document_id=document.get("id"), + target_id=target_id, + name_of_collection_containing_target=name_of_collection_containing_target_document, + ) + violation_as_str = ( + f"Document '{violation.source_document_id}' " + f"in collection '{violation.source_collection_name}' " + f"has a field '{violation.source_field_name}' that " + f"references a document having id " + f"'{violation.target_id}', but the latter document " + f"does not exist in any of the collections the " + f"NMDC Schema says it can exist in." + ) + validation_errors[source_collection_name].append(violation_as_str) + + # If any collection's error list is not empty, return an error response. + if any(len(v) > 0 for v in validation_errors.values()): + return {"result": "errors", "detail": validation_errors} + return {"result": "All Okay!"} else: return {"result": "errors", "detail": validation_errors} From 3389a4dab1a044e3272cc70f556a5aea93c97ee7 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 18 Dec 2024 04:21:15 +0000 Subject: [PATCH 11/12] style: reformat --- nmdc_runtime/util.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index d242c31e..81aa8bcd 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -655,7 +655,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr print(f"Inserting documents into the OverlayDB.") for collection_name, documents_to_insert in docs.items(): try: - overlay_db.replace_or_insert_many(collection_name, documents_to_insert) + overlay_db.replace_or_insert_many( + collection_name, documents_to_insert + ) except OverlayDBError as error: validation_errors[collection_name].append(str(error)) @@ -672,7 +674,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr # Note: Much of this code was copy/pasted from refscan, at: # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 # - print(f"Checking references emanating from documents inserted into '{source_collection_name}'.") + print( + f"Checking references emanating from documents inserted into '{source_collection_name}'." + ) for document in documents_inserted: # Get the document's schema class name so that we can interpret its fields accordingly. source_class_name = derive_schema_class_name_from_document( @@ -705,9 +709,7 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr target_ids = document[field_name] else: target_id = document[field_name] - target_ids = [ - target_id - ] # makes a one-item list + target_ids = [target_id] # makes a one-item list for target_id in target_ids: name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( @@ -721,7 +723,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr violation = Violation( source_collection_name=source_collection_name, source_field_name=field_name, - source_document_object_id=document.get("_id"), + source_document_object_id=document.get( + "_id" + ), source_document_id=document.get("id"), target_id=target_id, name_of_collection_containing_target=name_of_collection_containing_target_document, @@ -735,7 +739,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr f"does not exist in any of the collections the " f"NMDC Schema says it can exist in." ) - validation_errors[source_collection_name].append(violation_as_str) + validation_errors[ + source_collection_name + ].append(violation_as_str) # If any collection's error list is not empty, return an error response. if any(len(v) > 0 for v in validation_errors.values()): From bfd880f96829ea27db19a23e9dcb328d7308aebf Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 20:21:31 -0800 Subject: [PATCH 12/12] Disable referential integrity checking in `validate_json` by default --- nmdc_runtime/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 81aa8bcd..780ec865 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -578,7 +578,7 @@ def merge_find(self, coll_name, find_spec: dict): yield doc -def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = True): +def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = False): r""" Checks whether the specified dictionary represents a valid instance of the `Database` class defined in the NMDC Schema.