Skip to content

Commit

Permalink
Merge pull request #532 from microbiomedata/531-implement-api-endpoin…
Browse files Browse the repository at this point in the history
…t-that-accepts-id-value-and-returns-its-class-name-and-collection-name

Implement API endpoint that returns name of collection containing document having a given `id`
  • Loading branch information
eecavanna authored Jun 13, 2024
2 parents 8de5ef1 + d3426e9 commit 6bcf735
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 0 deletions.
100 changes: 100 additions & 0 deletions nmdc_runtime/api/endpoints/nmdcschema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from importlib.metadata import version
import re

import pymongo
from fastapi import APIRouter, Depends, HTTPException
Expand All @@ -8,6 +9,8 @@
from pymongo.database import Database as MongoDatabase
from starlette import status
from toolz import dissoc
from linkml_runtime.utils.schemaview import SchemaView
from nmdc_schema.nmdc_data import get_nmdc_schema_definition

from nmdc_runtime.api.core.metadata import map_id_to_collection, get_collection_for_id
from nmdc_runtime.api.core.util import raise404_if_none
Expand Down Expand Up @@ -131,6 +134,103 @@ def get_by_id(
)


@router.get("/nmdcschema/ids/{doc_id}/collection-name")
def get_collection_name_by_doc_id(
    doc_id: str,
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Returns the name of the collection, if any, containing the document having the specified `id`.

    This endpoint uses the NMDC Schema to determine the schema class of which an instance could have
    the specified value as its `id`; and then uses the NMDC Schema to determine the names of the
    `Database` slots (i.e. Mongo collection names) that could contain instances of that schema class.
    This endpoint then searches those Mongo collections for a document having that `id`.
    If it finds one, it responds with the name of the collection containing the document.
    If it does not find one, it responds with an `HTTP 404 Not Found` response.
    """
    # `doc_id` is the `id` value taken from the URL path; `mdb` is the Mongo database
    # injected via `Depends(get_mongo_db)`. On success the handler returns a dict with
    # the keys `id` and `collection_name`; every failure path below raises an HTTP 404.

    # Note: The `nmdc_runtime.api.core.metadata.map_id_to_collection` function is
    #       not used here because that function (a) only processes collections whose
    #       names end with `_set` and (b) only works for `id` values that are in
    #       use in the database (as opposed to hypothetical `id` values).

    # Step 1: Extract the typecode portion, if any, of the specified `id`.
    #
    # Examples:
    # - "nmdc:foo-123-456" → "foo"
    # - "foo:nmdc-123-456" → `None`
    #
    # The capture group is optional, so an `id` such as "nmdc:-123" matches the
    # pattern but yields `None` for the group; both cases are rejected below.
    typecode_match = re.search(r"^nmdc:(\w+)?-", doc_id)
    typecode_portion = typecode_match.group(1) if typecode_match else None
    if typecode_portion is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The typecode portion of the specified `id` is invalid.",
        )

    # Step 2: Determine the schema class, if any, of which the specified `id`
    # could belong to an instance (the first typecode whose name matches wins).
    matching_typecode = next(
        (tc for tc in typecodes() if tc["name"] == typecode_portion),
        None,
    )
    if matching_typecode is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any schema classes.",
        )
    # Strip the first "nmdc:" occurrence so the name can be compared with the
    # unprefixed class names that `SchemaView` reports.
    schema_class_name = matching_typecode["schema_class"].replace("nmdc:", "", 1)

    # Step 3: Determine the Mongo collection(s) in which instances of that
    # schema class can reside.
    #
    # NOTE(review): the `SchemaView` is rebuilt on every request; if this turns
    # out to be costly, consider constructing it once and reusing it.
    database_class_name = "Database"
    schema_view = SchemaView(get_nmdc_schema_definition())
    collection_names = []
    for slot_name in schema_view.class_slots(database_class_name):
        slot_definition = schema_view.induced_slot(slot_name, database_class_name)

        # Only multivalued, inlined-as-list slots represent Mongo collections.
        if not (slot_definition.multivalued and slot_definition.inlined_as_list):
            continue

        # The collection can hold instances of the slot's range class and of
        # any class that `class_descendants` reports for that range.
        names_of_eligible_classes = schema_view.class_descendants(
            slot_definition.range
        )
        if schema_class_name in names_of_eligible_classes:
            collection_names.append(slot_name)

    if not collection_names:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any database collections.",
        )

    # Step 4: Use the Mongo database to determine which of those collections a
    # document having that `id` actually resides in, if any. If multiple
    # collections contain such a document, report only the first one.
    # `limit=1` lets Mongo stop counting as soon as one match is found.
    containing_collection_name = next(
        (
            name
            for name in collection_names
            if mdb.get_collection(name=name).count_documents(
                dict(id=doc_id), limit=1
            )
            > 0
        ),
        None,
    )
    if containing_collection_name is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` does not belong to any documents.",
        )

    return {
        "id": doc_id,
        "collection_name": containing_collection_name,
    }


@router.get(
"/nmdcschema/{collection_name}/{doc_id}",
response_model=Doc,
Expand Down
36 changes: 36 additions & 0 deletions tests/test_api/test_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,39 @@ def test_submit_workflow_activities(api_site_client):
if doc_to_restore:
mdb[test_collection].insert_one(doc_to_restore)
assert "id" in rv.json() and "input_read_count" not in rv.json()


def test_get_class_name_and_collection_names_by_doc_id():
    """Exercise `GET /nmdcschema/ids/{doc_id}/collection-name` against a live API.

    Seeds one document into the `study_set` collection, checks the success
    case and two "not found" cases, and always removes the seeded document
    afterwards so that repeated runs do not accumulate test data.
    """
    base_url = os.getenv("API_HOST")

    def request_collection_name(id_):
        # All three scenarios hit the same route and differ only in the `id`.
        return requests.request(
            "GET",
            f"{base_url}/nmdcschema/ids/{id_}/collection-name",
        )

    # Seed the database.
    mdb = get_mongo_db()
    study_set_collection = mdb.get_collection(name="study_set")
    seeded_id = "nmdc:sty-1-foobar"
    insert_result = study_set_collection.insert_one(dict(id=seeded_id))

    try:
        # Valid `id`, and the document exists in database.
        response = request_collection_name(seeded_id)
        # Check the status first so a failure reports the status code rather
        # than an error from parsing an unexpected body.
        assert response.status_code == 200
        body = response.json()
        assert body["id"] == seeded_id
        assert body["collection_name"] == "study_set"

        # Valid `id`, but the document does not exist in database.
        response = request_collection_name("nmdc:sty-1-bazqux")
        assert response.status_code == 404

        # Invalid `id` (because "foo" is an invalid typecode).
        response = request_collection_name("nmdc:foo-1-foobar")
        assert response.status_code == 404
    finally:
        # Remove exactly the document this test inserted (matched by its
        # Mongo `_id`), leaving any other documents untouched.
        study_set_collection.delete_one({"_id": insert_result.inserted_id})

0 comments on commit 6bcf735

Please sign in to comment.