Merge pull request #532 from microbiomedata/531-implement-api-endpoin…

…t-that-accepts-id-value-and-returns-its-class-name-and-collection-name Implement API endpoint that returns name of collection containing document having a given `id`
microbiomedata · Jun 13, 2024 · 6bcf735 · 6bcf735
2 parents 8de5ef1 + d3426e9
commit 6bcf735
Show file tree

Hide file tree

Showing 2 changed files with 136 additions and 0 deletions.
diff --git a/nmdc_runtime/api/endpoints/nmdcschema.py b/nmdc_runtime/api/endpoints/nmdcschema.py
@@ -1,4 +1,5 @@
 from importlib.metadata import version
+import re
 
 import pymongo
 from fastapi import APIRouter, Depends, HTTPException
@@ -8,6 +9,8 @@
 from pymongo.database import Database as MongoDatabase
 from starlette import status
 from toolz import dissoc
+from linkml_runtime.utils.schemaview import SchemaView
+from nmdc_schema.nmdc_data import get_nmdc_schema_definition
 
 from nmdc_runtime.api.core.metadata import map_id_to_collection, get_collection_for_id
 from nmdc_runtime.api.core.util import raise404_if_none
@@ -131,6 +134,103 @@ def get_by_id(
     )
 
 
+@router.get("/nmdcschema/ids/{doc_id}/collection-name")
+def get_collection_name_by_doc_id(
+    doc_id: str,
+    mdb: MongoDatabase = Depends(get_mongo_db),
+):
+    r"""
+    Returns the name of the collection, if any, containing the document having the specified `id`.
+
+    This endpoint uses the NMDC Schema to determine the schema class of which an instance could have
+    the specified value as its `id`; and then uses the NMDC Schema to determine the names of the
+    `Database` slots (i.e. Mongo collection names) that could contain instances of that schema class.
+
+    This endpoint then searches those Mongo collections for a document having that `id`.
+    If it finds one, it responds with the name of the collection containing the document.
+    If it does not find one, it responds with an `HTTP 404 Not Found` response.
+    """
+    # Note: The `nmdc_runtime.api.core.metadata.map_id_to_collection` function is
+    #       not used here because that function (a) only processes collections whose
+    #       names end with `_set` and (b) only works for `id` values that are in
+    #       use in the database (as opposed to hypothetical `id` values).
+
+    # Extract the typecode portion, if any, of the specified `id`.
+    #
+    # Examples:
+    # - "nmdc:foo-123-456" → "foo"
+    # - "foo:nmdc-123-456" → `None`
+    # - "nmdc:-123-456" → `None` (the pattern matches, but the optional typecode group is empty)
+    pattern = re.compile(r"^nmdc:(\w+)?-")
+    match = pattern.search(doc_id)
+    typecode_portion = match.group(1) if match else None
+
+    if typecode_portion is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="The typecode portion of the specified `id` is invalid.",
+        )
+
+    # Determine the schema class, if any, of which the specified `id` could belong to an instance.
+    schema_class_name = None
+    for typecode in typecodes():
+        if typecode_portion == typecode["name"]:
+            schema_class_name_prefixed = typecode["schema_class"]
+            schema_class_name = schema_class_name_prefixed.replace("nmdc:", "", 1)
+            break
+
+    if schema_class_name is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="The specified `id` is not compatible with any schema classes.",
+        )
+
+    # Determine the Mongo collection(s) in which instances of that schema class can reside.
+    collection_names = []
+    DATABASE_CLASS_NAME = "Database"
+    schema_view = SchemaView(get_nmdc_schema_definition())  # NOTE(review): built on every request; consider caching
+    for slot_name in schema_view.class_slots(DATABASE_CLASS_NAME):
+        slot_definition = schema_view.induced_slot(slot_name, DATABASE_CLASS_NAME)
+
+        # Only multivalued slots that are inlined as lists are treated as Mongo collections; skip the rest.
+        if not (slot_definition.multivalued and slot_definition.inlined_as_list):
+            continue
+
+        # Determine the names of the classes whose instances can be stored in this collection.
+        name_of_eligible_class = slot_definition.range
+        names_of_eligible_classes = schema_view.class_descendants(
+            name_of_eligible_class
+        )
+        if schema_class_name in names_of_eligible_classes:
+            collection_names.append(slot_name)
+
+    if not collection_names:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="The specified `id` is not compatible with any database collections.",
+        )
+
+    # Use the Mongo database to determine which of those collections a document having that `id` actually
+    # resides in, if any. If multiple collections contain such a document, report only the first one.
+    containing_collection_name = None
+    for collection_name in collection_names:
+        collection = mdb.get_collection(name=collection_name)
+        if collection.count_documents(dict(id=doc_id), limit=1) > 0:
+            containing_collection_name = collection_name
+            break
+
+    if containing_collection_name is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="The specified `id` does not belong to any documents.",
+        )
+
+    return {
+        "id": doc_id,
+        "collection_name": containing_collection_name,
+    }
+
+
 @router.get(
     "/nmdcschema/{collection_name}/{doc_id}",
     response_model=Doc,

diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py
@@ -279,3 +279,39 @@ def test_submit_workflow_activities(api_site_client):
     if doc_to_restore:
         mdb[test_collection].insert_one(doc_to_restore)
     assert "id" in rv.json() and "input_read_count" not in rv.json()
+
+
+def test_get_class_name_and_collection_names_by_doc_id():
+    """
+    Exercise `GET /nmdcschema/ids/{doc_id}/collection-name` against the running API.
+
+    Covers a seeded `id` (200), a well-formed but absent `id` (404), and an `id`
+    whose typecode is unknown (404). The seeded document is removed afterwards.
+    """
+    base_url = os.getenv("API_HOST")
+    seeded_id = "nmdc:sty-1-foobar"
+
+    def get_collection_name(id_):
+        return requests.get(f"{base_url}/nmdcschema/ids/{id_}/collection-name")
+
+    # Seed the database. An upsert keeps the seeding idempotent across repeated runs.
+    study_set_collection = get_mongo_db().get_collection(name="study_set")
+    study_set_collection.replace_one({"id": seeded_id}, {"id": seeded_id}, upsert=True)
+    try:
+        # Valid `id`, and the document exists in database.
+        response = get_collection_name(seeded_id)
+        assert response.status_code == 200
+        body = response.json()
+        assert body["id"] == seeded_id
+        assert body["collection_name"] == "study_set"
+
+        # Valid `id`, but the document does not exist in database.
+        response = get_collection_name("nmdc:sty-1-bazqux")
+        assert response.status_code == 404
+
+        # Invalid `id` (because "foo" is an invalid typecode).
+        response = get_collection_name("nmdc:foo-1-foobar")
+        assert response.status_code == 404
+    finally:
+        # Remove the seeded document so that later tests see an unmodified collection.
+        study_set_collection.delete_many({"id": seeded_id})