Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement API endpoint that returns name of collection containing document having a given id #532

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9461e15
Implement endpoint that returns collection and class name compatible …
eecavanna May 23, 2024
03f94b6
style: reformat
invalid-email-address May 23, 2024
630e5cb
clarify intended logic?
dwinston Jun 7, 2024
4c17335
Pluralize word in comment and check for truth instead of truthiness
eecavanna Jun 13, 2024
afe01be
Return HTTP 404 response when any requested information is not found
eecavanna Jun 13, 2024
42d2eef
style: reformat
invalid-email-address Jun 13, 2024
46d3628
Update endpoint path to be consistent with behavior
eecavanna Jun 13, 2024
3957a50
Add comment about using schema instead of real database
eecavanna Jun 13, 2024
2762580
Implement tests targeting newly-implemented API endpoint
eecavanna Jun 13, 2024
bcb8cc4
Remove reference to undefined variable (oops)
eecavanna Jun 13, 2024
a1dca11
Include base URL when submitting HTTP requests in test
eecavanna Jun 13, 2024
e9fe984
Return name of collection, if any, in which document actually exists
eecavanna Jun 13, 2024
4647563
Fix bug where database object was being used as client object
eecavanna Jun 13, 2024
276a15a
Rename JSON properties to describe their values more specifically
eecavanna Jun 13, 2024
a46d9bf
Fix typos in comment
eecavanna Jun 13, 2024
a7e0baa
Refrain from counting more documents than necessary
eecavanna Jun 13, 2024
86ef100
Remove "hypothetical" information from API response
eecavanna Jun 13, 2024
e2c3977
Update test to use new URL path (oops)
eecavanna Jun 13, 2024
53c62ec
Respond with HTTP 404 when document having `id` is not found
eecavanna Jun 13, 2024
d3426e9
style: reformat
invalid-email-address Jun 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions nmdc_runtime/api/endpoints/nmdcschema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from importlib.metadata import version
import re

import pymongo
from fastapi import APIRouter, Depends, HTTPException
Expand All @@ -8,6 +9,8 @@
from pymongo.database import Database as MongoDatabase
from starlette import status
from toolz import dissoc
from linkml_runtime.utils.schemaview import SchemaView
from nmdc_schema.nmdc_data import get_nmdc_schema_definition

from nmdc_runtime.api.core.metadata import map_id_to_collection, get_collection_for_id
from nmdc_runtime.api.core.util import raise404_if_none
Expand Down Expand Up @@ -131,6 +134,103 @@ def get_by_id(
)


@router.get("/nmdcschema/ids/{doc_id}/collection-name")
def get_collection_name_by_doc_id(
    doc_id: str,
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Returns the name of the collection, if any, containing the document having the specified `id`.

    This endpoint uses the NMDC Schema to determine the schema class of which an instance could have
    the specified value as its `id`; and then uses the NMDC Schema to determine the names of the
    `Database` slots (i.e. Mongo collection names) that could contain instances of that schema class.

    This endpoint then searches those Mongo collections for a document having that `id`.
    If it finds one, it responds with the name of the collection containing the document.
    If it does not find one, it responds with an `HTTP 404 Not Found` response.
    """
    # Developer notes (kept out of the docstring, which FastAPI publishes as
    # the endpoint's description):
    #
    # - `doc_id` is the `id` value taken from the URL path.
    # - `mdb` is the Mongo database handle injected via `get_mongo_db`.
    # - On success, returns a dict with the keys `id` and `collection_name`.
    # - Raises `HTTPException` (404) when: the `id` has no typecode portion,
    #   the typecode maps to no schema class, the schema class maps to no
    #   collection, or no candidate collection contains the document.
    #
    # Note: The `nmdc_runtime.api.core.metadata.map_id_to_collection` function is
    #       not used here because that function (a) only processes collections whose
    #       names end with `_set` and (b) only works for `id` values that are in
    #       use in the database (as opposed to hypothetical `id` values).

    # Extract the typecode portion, if any, of the specified `id`.
    #
    # Examples:
    # - "nmdc:foo-123-456" → "foo"
    # - "foo:nmdc-123-456" → `None` (the pattern does not match)
    # - "nmdc:-123-456"    → `None` (the optional group is absent)
    #
    match = re.search(r"^nmdc:(\w+)?-", doc_id)
    typecode_portion = match.group(1) if match else None

    if typecode_portion is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The typecode portion of the specified `id` is invalid.",
        )

    # Determine the schema class, if any, of which the specified `id` could belong to an instance.
    # The first typecode whose name matches wins; its class name is stored with
    # an "nmdc:" prefix, which is removed here.
    schema_class_name = next(
        (
            typecode["schema_class"].replace("nmdc:", "", 1)
            for typecode in typecodes()
            if typecode_portion == typecode["name"]
        ),
        None,
    )

    if schema_class_name is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any schema classes.",
        )

    # Determine the Mongo collection(s) in which instances of that schema class can reside.
    # This consults the schema only; the database is not queried until later.
    database_class_name = "Database"
    schema_view = SchemaView(get_nmdc_schema_definition())
    collection_names = []
    for slot_name in schema_view.class_slots(database_class_name):
        slot_definition = schema_view.induced_slot(slot_name, database_class_name)

        # If this slot doesn't represent a Mongo collection, skip it.
        if not (slot_definition.multivalued and slot_definition.inlined_as_list):
            continue

        # The collection can hold instances of the slot's range class and of
        # whatever `class_descendants` reports for that class.
        names_of_eligible_classes = schema_view.class_descendants(
            slot_definition.range
        )
        if schema_class_name in names_of_eligible_classes:
            collection_names.append(slot_name)

    if not collection_names:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any database collections.",
        )

    # Use the Mongo database to determine which of those collections a document having that `id` actually
    # resides in, if any. If multiple collections contain such a document, report only the first one.
    containing_collection_name = None
    for collection_name in collection_names:
        collection = mdb.get_collection(name=collection_name)
        # `limit=1` stops counting at the first match; only existence matters.
        if collection.count_documents(dict(id=doc_id), limit=1) > 0:
            containing_collection_name = collection_name
            break

    if containing_collection_name is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` does not belong to any documents.",
        )

    return {
        "id": doc_id,
        "collection_name": containing_collection_name,
    }


@router.get(
"/nmdcschema/{collection_name}/{doc_id}",
response_model=Doc,
Expand Down
36 changes: 36 additions & 0 deletions tests/test_api/test_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,39 @@ def test_submit_workflow_activities(api_site_client):
if doc_to_restore:
mdb[test_collection].insert_one(doc_to_restore)
assert "id" in rv.json() and "input_read_count" not in rv.json()


def test_get_class_name_and_collection_names_by_doc_id():
    """Exercise `GET /nmdcschema/ids/{doc_id}/collection-name` against a running API.

    Covers three cases: an `id` whose document exists (200), a well-formed
    `id` with no document (404), and an `id` with an unknown typecode (404).
    The seeded document is removed afterwards so the test can be re-run and
    does not leak state into other tests.
    """
    base_url = os.getenv("API_HOST")
    seeded_id = "nmdc:sty-1-foobar"

    # Seed the database. An upsert keeps this idempotent even if an earlier,
    # interrupted run left the same document behind.
    mdb = get_mongo_db()
    study_set_collection = mdb.get_collection(name="study_set")
    study_set_collection.replace_one(
        dict(id=seeded_id), dict(id=seeded_id), upsert=True
    )

    try:
        # Valid `id`, and the document exists in database.
        id_ = seeded_id
        response = requests.get(f"{base_url}/nmdcschema/ids/{id_}/collection-name")
        body = response.json()
        assert response.status_code == 200
        assert body["id"] == id_
        assert body["collection_name"] == "study_set"

        # Valid `id`, but the document does not exist in database.
        id_ = "nmdc:sty-1-bazqux"
        response = requests.get(f"{base_url}/nmdcschema/ids/{id_}/collection-name")
        assert response.status_code == 404

        # Invalid `id` (because "foo" is an invalid typecode).
        id_ = "nmdc:foo-1-foobar"
        response = requests.get(f"{base_url}/nmdcschema/ids/{id_}/collection-name")
        assert response.status_code == 404
    finally:
        # Remove the seeded document regardless of the test outcome.
        study_set_collection.delete_many(dict(id=seeded_id))