diff --git a/nmdc_runtime/api/endpoints/find.py b/nmdc_runtime/api/endpoints/find.py index a52ecba2..97499e9f 100644 --- a/nmdc_runtime/api/endpoints/find.py +++ b/nmdc_runtime/api/endpoints/find.py @@ -1,7 +1,7 @@ from operator import itemgetter from typing import List, Annotated -from fastapi import APIRouter, Depends, Form, Path +from fastapi import APIRouter, Depends, Form, Path, Query from jinja2 import Environment, PackageLoader, select_autoescape from nmdc_runtime.minter.config import typecodes from nmdc_runtime.util import get_nmdc_jsonschema_dict @@ -21,15 +21,12 @@ find_resources, strip_oid, find_resources_spanning, - pipeline_find_resources, ) from nmdc_runtime.api.models.metadata import Doc from nmdc_runtime.api.models.util import ( FindResponse, FindRequest, entity_attributes_to_index, - PipelineFindRequest, - PipelineFindResponse, ) from nmdc_runtime.util import get_class_names_from_collection_spec @@ -42,7 +39,7 @@ response_model_exclude_unset=True, ) def find_studies( - req: FindRequest = Depends(), + req: Annotated[FindRequest, Query()], mdb: MongoDatabase = Depends(get_mongo_db), ): """ @@ -58,7 +55,14 @@ def find_studies( response_model_exclude_unset=True, ) def find_study_by_id( - study_id: str, + study_id: Annotated[ + str, + Path( + title="Study ID", + description="The `id` of the `Study` you want to find.\n\n_Example_: `nmdc:sty-11-abc123`", + examples=["nmdc:sty-11-abc123"], + ), + ], mdb: MongoDatabase = Depends(get_mongo_db), ): """ @@ -74,7 +78,7 @@ def find_study_by_id( response_model_exclude_unset=True, ) def find_biosamples( - req: FindRequest = Depends(), + req: Annotated[FindRequest, Query()], mdb: MongoDatabase = Depends(get_mongo_db), ): """ @@ -90,7 +94,14 @@ def find_biosamples( response_model_exclude_unset=True, ) def find_biosample_by_id( - sample_id: str, + sample_id: Annotated[ + str, + Path( + title="Biosample ID", + description="The `id` of the `Biosample` you want to find.\n\n_Example_: `nmdc:bsm-11-abc123`", + 
examples=["nmdc:bsm-11-abc123"], + ), + ], mdb: MongoDatabase = Depends(get_mongo_db), ): """ @@ -106,7 +117,7 @@ def find_biosample_by_id( response_model_exclude_unset=True, ) def find_data_objects( - req: FindRequest = Depends(), + req: Annotated[FindRequest, Query()], mdb: MongoDatabase = Depends(get_mongo_db), ): """ @@ -135,9 +146,22 @@ def get_classname_from_typecode(doc_id: str) -> str: @router.get( "/data_objects/study/{study_id}", response_model_exclude_unset=True, + # Note: We include a description here so that FastAPI does not use the function's docstring as the API endpoint's + # description. The docstring currently contains non-user-facing information, such as mentioning the + # implementation detail of using the `alldocs` collection under the hood, and mentioning function parameters + # that are not API request parameters. + description="Gets all `DataObject`s related to all `Biosample`s related to the specified `Study`.", ) def find_data_objects_for_study( - study_id: str, + study_id: Annotated[ + str, + Path( + title="Study ID", + description="""The `id` of the `Study` having `Biosample`s with which you want to find + associated `DataObject`s.\n\n_Example_: `nmdc:sty-11-abc123`""", + examples=["nmdc:sty-11-abc123"], + ), + ], mdb: MongoDatabase = Depends(get_mongo_db), ): """This API endpoint is used to retrieve data objects associated with @@ -256,7 +280,14 @@ def process_informed_by_docs(doc, collected_objects, unique_ids): response_model_exclude_unset=True, ) def find_data_object_by_id( - data_object_id: str, + data_object_id: Annotated[ + str, + Path( + title="DataObject ID", + description="The `id` of the `DataObject` you want to find.\n\n_Example_: `nmdc:dobj-11-abc123`", + examples=["nmdc:dobj-11-abc123"], + ), + ], mdb: MongoDatabase = Depends(get_mongo_db), ): """ @@ -274,19 +305,17 @@ def find_data_object_by_id( response_model_exclude_unset=True, ) def find_planned_processes( - req: FindRequest = Depends(), + req: Annotated[FindRequest, 
Query()], mdb: MongoDatabase = Depends(get_mongo_db), ): - # TODO: Add w3id URL links for classes (e.g. ) when they resolve - # to Berkeley schema definitions. """ The GET /planned_processes endpoint is a general way to fetch metadata about various planned processes (e.g. workflow execution, material processing, etc.). Any "slot" (a.k.a. attribute) for - `PlannedProcess` may be used in the filter + [`PlannedProcess`](https://w3id.org/nmdc/PlannedProcess) may be used in the filter and sort parameters, including attributes of subclasses of *PlannedProcess*. - For example, attributes used in subclasses such as `Extraction` (subclass of *PlannedProcess*), - can be used as input criteria for the filter and sort parameters of this endpoint. + For example, attributes used in subclasses such as [`Extraction`](https://w3id.org/nmdc/Extraction) + (subclass of *PlannedProcess*), can be used as input criteria for the filter and sort parameters of this endpoint. """ return find_resources_spanning( req, @@ -346,13 +375,17 @@ def attr_index_sort_key(attr): def documentation_links(jsonschema_dict, collection_names) -> dict: - """TODO: Add a docstring saying what this function does at a high level.""" + """This function constructs a hierarchical catalog of (links to) schema classes and their slots. - # TODO: Document the purpose of this initial key. - doc_links = {"Activity": []} + The returned dictionary `doc_links` is used as input to the Jinja template `nmdc_runtime/templates/search.html` + in order to support user experience for `GET /search`. + """ # Note: All documentation URLs generated within this function will begin with this. - base_url = r"https://microbiomedata.github.io/nmdc-schema" + base_url = r"https://w3id.org/nmdc" + + # Initialize dictionary in which to associate key/value pairs via the following for loop. 
+ doc_links = {} for collection_name in collection_names: # Since a given collection can be associated with multiple classes, the `doc_links` dictionary @@ -398,7 +431,7 @@ def documentation_links(jsonschema_dict, collection_names) -> dict: return doc_links -@router.get("/search", response_class=HTMLResponse) +@router.get("/search", response_class=HTMLResponse, include_in_schema=False) def search_page( mdb: MongoDatabase = Depends(get_mongo_db), ): @@ -423,38 +456,3 @@ def search_page( doc_links=doc_links, ) return HTMLResponse(content=html_content, status_code=200) - - -@router.post( - "/pipeline_search", - response_model=PipelineFindResponse, - response_model_exclude_unset=True, -) -def pipeline_search( - req: PipelineFindRequest = Depends(), - mdb: MongoDatabase = Depends(get_mongo_db), -): - return pipeline_find_resources(req, mdb) - - -@router.post( - "/pipeline_search_form", - response_model=PipelineFindResponse, - response_model_exclude_unset=True, -) -def pipeline_search( - pipeline_spec: str = Form(...), - description: str = Form(...), - mdb: MongoDatabase = Depends(get_mongo_db), -): - req = PipelineFindRequest(pipeline_spec=pipeline_spec, description=description) - return pipeline_find_resources(req, mdb) - - -@router.get("/pipeline_search", response_class=HTMLResponse) -def pipeline_search( - mdb: MongoDatabase = Depends(get_mongo_db), -): - template = jinja_env.get_template("pipeline_search.html") - html_content = template.render() - return HTMLResponse(content=html_content, status_code=200) diff --git a/nmdc_runtime/api/endpoints/jobs.py b/nmdc_runtime/api/endpoints/jobs.py index 1248d6b0..b34829c3 100644 --- a/nmdc_runtime/api/endpoints/jobs.py +++ b/nmdc_runtime/api/endpoints/jobs.py @@ -1,8 +1,8 @@ import json -from typing import Optional +from typing import Optional, Annotated import pymongo -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, Query from nmdc_runtime.api.core.util import ( raise404_if_none, @@ -25,7 
+25,7 @@ "/jobs", response_model=ListResponse[Job], response_model_exclude_unset=True ) def list_jobs( - req: ListRequest = Depends(), + req: Annotated[ListRequest, Query()], mdb: pymongo.database.Database = Depends(get_mongo_db), maybe_site: Optional[Site] = Depends(maybe_get_current_client_site), ): @@ -55,22 +55,3 @@ def claim_job( site: Site = Depends(get_current_client_site), ): return _claim_job(job_id, mdb, site) - - -@router.get( - "/jobs/{job_id}/executions", - description=( - "A sub-resource of a job resource, the result of a successful run of that job. " - "An execution resource may be retrieved by any site; however, it may be created " - "and updated only by the site that ran its job." - ), -) -def list_job_executions(): - # TODO - pass - - -@router.get("/jobs/{job_id}/executions/{exec_id}") -def get_job_execution(): - # TODO - pass diff --git a/nmdc_runtime/api/endpoints/metadata.py b/nmdc_runtime/api/endpoints/metadata.py index 9d389298..5b30c77f 100644 --- a/nmdc_runtime/api/endpoints/metadata.py +++ b/nmdc_runtime/api/endpoints/metadata.py @@ -6,10 +6,11 @@ from collections import defaultdict from copy import deepcopy from io import StringIO +from typing import Annotated import requests from dagster import ExecuteInProcessResult -from fastapi import APIRouter, Depends, File, HTTPException, UploadFile +from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, Path from gridfs import GridFS, NoFile from jsonschema import Draft7Validator from nmdc_runtime.api.core.metadata import _validate_changesheet, df_from_sheet_in @@ -43,6 +44,10 @@ async def raw_changesheet_from_uploaded_file(uploaded_file: UploadFile): + """ + Extract utf8-encoded text from fastapi.UploadFile object, and + construct ChangesheetIn object for subsequent processing. 
+ """ content_type = uploaded_file.content_type name = uploaded_file.filename if name.endswith(".csv"): @@ -56,14 +61,14 @@ async def raw_changesheet_from_uploaded_file(uploaded_file: UploadFile): @router.post("/metadata/changesheets:validate") async def validate_changesheet( - uploaded_file: UploadFile = File(...), + uploaded_file: UploadFile = File( + ..., description="The changesheet you want the server to validate" + ), mdb: MongoDatabase = Depends(get_mongo_db), ): - """ - - Example changesheet - [here](https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/notebooks/data/changesheet-without-separator3.tsv). - + r""" + Validates a [changesheet](https://microbiomedata.github.io/nmdc-runtime/howto-guides/author-changesheets/) + that is in either CSV or TSV format. """ sheet_in = await raw_changesheet_from_uploaded_file(uploaded_file) df_change = df_from_sheet_in(sheet_in, mdb) @@ -72,16 +77,22 @@ async def validate_changesheet( @router.post("/metadata/changesheets:submit", response_model=DrsObjectWithTypes) async def submit_changesheet( - uploaded_file: UploadFile = File(...), + uploaded_file: UploadFile = File( + ..., description="The changesheet you want the server to apply" + ), mdb: MongoDatabase = Depends(get_mongo_db), user: User = Depends(get_current_active_user), ): - """ - - Example changesheet - [here](https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/notebooks/data/changesheet-without-separator3.tsv). + r""" + Applies a [changesheet](https://microbiomedata.github.io/nmdc-runtime/howto-guides/author-changesheets/) + that is in either CSV or TSV format. + **Note:** This endpoint is only accessible to users that have been granted access by a Runtime administrator. """ + # TODO: Allow users to determine whether they have that access (i.e. 
whether they are allowed to perform the + # `/metadata/changesheets:submit` action), themselves, so that they don't have to contact an admin + # or submit an example changesheet in order to find that out. + if not permitted(user.username, "/metadata/changesheets:submit"): raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, @@ -111,11 +122,30 @@ async def submit_changesheet( return doc_after -@router.get("/metadata/stored_files/{object_id}") +@router.get("/metadata/stored_files/{object_id}", include_in_schema=False) async def get_stored_metadata_object( - object_id: str, + object_id: Annotated[ + str, + Path( + title="Metadata file ObjectId", + description="The ObjectId (`_id`) of the metadata file you want to get.\n\n_Example_: `507f1f77bcf86cd799439011`", + examples=["507f1f77bcf86cd799439011"], + ), + ], mdb: MongoDatabase = Depends(get_mongo_db), ): + r""" + This endpoint is subservient to our Data Repository Service (DRS) implementation, i.e. the `/objects/*` endpoints. + In particular, URLs resolving to this route are generated + by the DRS `/objects/{object_id}/access/{access_id}` endpoint if we store the raw object in our MongoDB via GridFS. + We currently do this for request bodies for `/metadata/json:submit` and `/metadata/changesheets:submit`. + A typical API user would not call this endpoint directly. Rather, it merely forms part of the API surface. + Therefore, we do not include it in the OpenAPI schema. 
+ + References: + - https://pymongo.readthedocs.io/en/stable/examples/gridfs.html + - https://www.mongodb.com/docs/manual/core/gridfs/#use-gridfs + """ mdb_fs = GridFS(mdb) try: grid_out = mdb_fs.get(object_id) @@ -136,87 +166,6 @@ def iter_grid_out(): ) -url_pattern = re.compile(r"https?://(?P[^/]+)/(?P.+)") - - -def url_to_name(url): - m = url_pattern.match(url) - return f"{'.'.join(reversed(m.group('domain').split('.')))}__{m.group('path').replace('/', '.')}" - - -def result_for_url_to_json_file(data, url, save_dir): - with open(os.path.join(save_dir, url_to_name(url)), "w") as f: - json.dump(data.json(), f) - - -def fetch_downloaded_json(url, save_dir): - with open(os.path.join(save_dir, url_to_name(url))) as f: - return json.load(f) - - -@router.post("/metadata/json:validate_urls_file") -async def validate_json_urls_file(urls_file: UploadFile = File(...)): - """ - - Given a text file with one URL per line, will try to validate each URL target - as a NMDC JSON Schema "nmdc:Database" object. - - """ - content_type = urls_file.content_type - filename = urls_file.filename - if content_type != "text/plain": - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=( - f"file {filename} has content type '{content_type}'. " - f"Only 'text/plain' (*.txt) files are permitted." - ), - ) - contents: bytes = await urls_file.read() - stream = StringIO(contents.decode()) # can e.g. 
import csv; csv.reader(stream) - - urls = [line.strip() for line in stream if line.strip()] - - def load_url(url, timeout): - return requests.get(url, timeout=timeout) - - with tempfile.TemporaryDirectory() as temp_dir: - with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - future_to_url = {executor.submit(load_url, url, 5): url for url in urls} - for future in concurrent.futures.as_completed(future_to_url): - url = future_to_url[future] - try: - data = future.result() - result_for_url_to_json_file(data, url, temp_dir) - except Exception as exc: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"{url} generated an exception: {exc}", - ) - - validator = Draft7Validator(get_nmdc_jsonschema_dict()) - validation_errors = defaultdict(list) - - for url in urls: - docs = fetch_downloaded_json(url, temp_dir) - docs, validation_errors_for_activity_set = specialize_activity_set_docs( - docs - ) - - validation_errors["activity_set"].extend( - validation_errors_for_activity_set["activity_set"] - ) - - for coll_name, coll_docs in docs.items(): - errors = list(validator.iter_errors({coll_name: coll_docs})) - validation_errors[coll_name].extend([e.message for e in errors]) - - if all(len(v) == 0 for v in validation_errors.values()): - return {"result": "All Okay!"} - else: - return {"result": "errors", "detail": validation_errors} - - @router.post("/metadata/json:validate", name="Validate JSON") async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mongo_db)): """ diff --git a/nmdc_runtime/api/endpoints/nmdcschema.py b/nmdc_runtime/api/endpoints/nmdcschema.py index ec4c2be7..9fcfdedd 100644 --- a/nmdc_runtime/api/endpoints/nmdcschema.py +++ b/nmdc_runtime/api/endpoints/nmdcschema.py @@ -1,9 +1,9 @@ from importlib.metadata import version import re -from typing import List, Dict +from typing import List, Dict, Annotated import pymongo -from fastapi import APIRouter, Depends, HTTPException +from fastapi import 
APIRouter, Depends, HTTPException, Path, Query from nmdc_runtime.config import DATABASE_CLASS_NAME from nmdc_runtime.minter.config import typecodes @@ -21,7 +21,11 @@ get_nonempty_nmdc_schema_collection_names, get_collection_names_from_schema, ) -from nmdc_runtime.api.endpoints.util import list_resources +from nmdc_runtime.api.endpoints.util import ( + list_resources, + strip_oid, + comma_separated_values, +) from nmdc_runtime.api.models.metadata import Doc from nmdc_runtime.api.models.util import ListRequest, ListResponse @@ -29,6 +33,9 @@ def ensure_collection_name_is_known_to_schema(collection_name: str): + r""" + Raises an exception if the specified string is _not_ the name of a collection described by the NMDC Schema. + """ names = get_collection_names_from_schema() if collection_name not in names: raise HTTPException( @@ -37,15 +44,13 @@ def ensure_collection_name_is_known_to_schema(collection_name: str): ) -def strip_oid(doc): - return dissoc(doc, "_id") - - @router.get("/nmdcschema/version") def get_nmdc_schema_version(): - """ - To view the [NMDC Schema](https://microbiomedata.github.io/nmdc-schema/) version the database is currently using, - try executing the GET /nmdcschema/version endpoint + r""" + Returns a string indicating which version of the [NMDC Schema](https://microbiomedata.github.io/nmdc-schema/) + the Runtime is using. + + **Note:** The same information—and more—is also available via the `/version` endpoint. 
""" return version("nmdc_schema") @@ -108,15 +113,25 @@ def get_nmdc_database_collection_stats( "/nmdcschema/{collection_name}", response_model=ListResponse[Doc], response_model_exclude_unset=True, - dependencies=[Depends(ensure_collection_name_is_known_to_schema)], ) def list_from_collection( - collection_name: str, - req: ListRequest = Depends(), + collection_name: Annotated[ + str, + Path( + title="Collection name", + description="The name of the collection.\n\n_Example_: `biosample_set`", + examples=["biosample_set"], + ), + ], + req: Annotated[ListRequest, Query()], mdb: MongoDatabase = Depends(get_mongo_db), ): - """ - Returns resources that match the specified filter criteria and reside in the specified collection. + r""" + Retrieves resources that match the specified filter criteria and reside in the specified collection. + + Searches the specified collection for documents matching the specified `filter` criteria. + If the `projection` parameter is used, each document in the response will only include + the fields specified by that parameter (plus the `id` field). You can get all the valid collection names from the [Database class](https://microbiomedata.github.io/nmdc-schema/Database/) page of the NMDC Schema documentation. @@ -130,12 +145,12 @@ def list_from_collection( the regex is a [prefix expression](https://www.mongodb.com/docs/manual/reference/operator/query/regex/#index-use), That will allow MongoDB to optimize the way it uses the regex, making this API endpoint respond faster. """ + # TODO: The note about collection names above is currently accurate, but will not necessarily always be accurate, # since the `Database` class could eventually have slots that aren't `multivalued` and `inlined_as_list`, - # which are things NMDC Schema maintainers say a `Database` slot must be in order for it to represent - # a MongoDB collection. + # which are traits a `Database` slot must have in order for it to represent a MongoDB collection. 
# - # TODO: Implement an API endpoint that returns all valid collection names (can get them via a `SchemaView`), + # TODO: Implement an API endpoint that returns all valid collection names (it can get them via a `SchemaView`), # Then replace the note above with a suggestion that the user access that API endpoint. rv = list_resources(req, mdb, collection_name) @@ -149,12 +164,18 @@ def list_from_collection( response_model_exclude_unset=True, ) def get_by_id( - doc_id: str, + doc_id: Annotated[ + str, + Path( + title="Document ID", + description="The `id` of the document you want to retrieve.\n\n_Example_: `nmdc:bsm-11-abc123`", + examples=["nmdc:bsm-11-abc123"], + ), + ], mdb: MongoDatabase = Depends(get_mongo_db), ): - """ - If the identifier of the record is known, the GET /nmdcshema/ids/{doc_id} can be used to retrieve the specified record. - \n Note that only one identifier may be used at a time, and therefore, only one record may be retrieved at a time using this method. + r""" + Retrieves the document having the specified `id`, regardless of which schema-described collection it resides in. 
""" id_dict = map_id_to_collection(mdb) collection_name = get_collection_for_id(doc_id, id_dict) @@ -167,7 +188,14 @@ def get_by_id( @router.get("/nmdcschema/ids/{doc_id}/collection-name") def get_collection_name_by_doc_id( - doc_id: str, + doc_id: Annotated[ + str, + Path( + title="Document ID", + description="The `id` of the document.\n\n_Example_: `nmdc:bsm-11-abc123`", + examples=["nmdc:bsm-11-abc123"], + ), + ], mdb: MongoDatabase = Depends(get_mongo_db), ): r""" @@ -265,23 +293,45 @@ def get_collection_name_by_doc_id( "/nmdcschema/{collection_name}/{doc_id}", response_model=Doc, response_model_exclude_unset=True, - dependencies=[Depends(ensure_collection_name_is_known_to_schema)], ) def get_from_collection_by_id( - collection_name: str, - doc_id: str, - projection: str | None = None, + collection_name: Annotated[ + str, + Path( + title="Collection name", + description="The name of the collection.\n\n_Example_: `biosample_set`", + examples=["biosample_set"], + ), + ], + doc_id: Annotated[ + str, + Path( + title="Document ID", + description="The `id` of the document you want to retrieve.\n\n_Example_: `nmdc:bsm-11-abc123`", + examples=["nmdc:bsm-11-abc123"], + ), + ], + projection: Annotated[ + str | None, + Query( + title="Projection", + description="""Comma-delimited list of the names of the fields you want the document in the response to + include.\n\n_Example_: `id,name,ecosystem_type`""", + examples=[ + "id,name,ecosystem_type", + ], + ), + ] = None, mdb: MongoDatabase = Depends(get_mongo_db), ): + r""" + Retrieves the document having the specified `id`, from the specified collection; optionally, including only the + fields specified via the `projection` parameter. """ - If both the identifier and the collection name of the desired record is known, the - GET /nmdcschema/{collection_name}/{doc_id} can be used to retrieve the record. The projection parameter is optionally - available for this endpoint to retrieve only desired attributes from a record. 
Please note that only one record can - be retrieved at one time using this method. + # Note: This helper function will raise an exception if the collection name is invalid. + ensure_collection_name_is_known_to_schema(collection_name) - for MongoDB-like [projection](https://www.mongodb.com/docs/manual/tutorial/project-fields-from-query-results/): comma-separated list of fields you want the objects in the response to include. Example: `id,doi` - """ - projection = projection.split(",") if projection else None + projection = comma_separated_values(projection) if projection else None try: return strip_oid( raise404_if_none( diff --git a/nmdc_runtime/api/endpoints/objects.py b/nmdc_runtime/api/endpoints/objects.py index 14760902..e9d1bf69 100644 --- a/nmdc_runtime/api/endpoints/objects.py +++ b/nmdc_runtime/api/endpoints/objects.py @@ -1,7 +1,7 @@ -from typing import List +from typing import List, Annotated import botocore -from fastapi import APIRouter, status, Depends, HTTPException +from fastapi import APIRouter, status, Depends, HTTPException, Query from gridfs import GridFS from pymongo import ReturnDocument from pymongo.database import Database as MongoDatabase @@ -91,7 +91,7 @@ def create_object( @router.get("/objects", response_model=ListResponse[DrsObject]) def list_objects( - req: ListRequest = Depends(), + req: Annotated[ListRequest, Query()], mdb: MongoDatabase = Depends(get_mongo_db), ): return list_resources(req, mdb, "objects") diff --git a/nmdc_runtime/api/endpoints/operations.py b/nmdc_runtime/api/endpoints/operations.py index c6bcccf2..d046e6dd 100644 --- a/nmdc_runtime/api/endpoints/operations.py +++ b/nmdc_runtime/api/endpoints/operations.py @@ -1,5 +1,7 @@ +from typing import Annotated + import pymongo -from fastapi import APIRouter, Depends, status, HTTPException +from fastapi import APIRouter, Depends, status, HTTPException, Query from toolz import get_in, merge, assoc from nmdc_runtime.api.core.util import raise404_if_none, pick @@ -20,7 +22,7 @@ 
@router.get("/operations", response_model=ListOperationsResponse[ResultT, MetadataT]) def list_operations( - req: ListRequest = Depends(), + req: Annotated[ListRequest, Query()], mdb: pymongo.database.Database = Depends(get_mongo_db), ): return list_resources(req, mdb, "operations") diff --git a/nmdc_runtime/api/endpoints/sites.py b/nmdc_runtime/api/endpoints/sites.py index 273b1407..bbed6ccc 100644 --- a/nmdc_runtime/api/endpoints/sites.py +++ b/nmdc_runtime/api/endpoints/sites.py @@ -1,8 +1,8 @@ -from typing import List +from typing import List, Annotated import botocore import pymongo.database -from fastapi import APIRouter, Depends, status, HTTPException, Path +from fastapi import APIRouter, Depends, status, HTTPException, Path, Query from starlette.status import HTTP_403_FORBIDDEN from nmdc_runtime.api.core.auth import ( @@ -74,7 +74,7 @@ def create_site( "/sites", response_model=ListResponse[Site], response_model_exclude_unset=True ) def list_sites( - req: ListRequest = Depends(), + req: Annotated[ListRequest, Query()], mdb: pymongo.database.Database = Depends(get_mongo_db), ): return list_resources(req, mdb, "sites") diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index 63118abc..0e24500d 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -44,8 +44,6 @@ FindRequest, FindResponse, ListRequest, - PipelineFindRequest, - PipelineFindResponse, ResultT, ) from nmdc_runtime.util import drs_metadata_for @@ -61,6 +59,7 @@ def check_filter(filter_: str): + """A pass-through function that checks if `filter_` is parsable as a JSON object. 
Raises otherwise.""" filter_ = filter_.strip() if not filter_.startswith("{") or not filter_.endswith("}"): raise HTTPException( @@ -144,7 +143,11 @@ def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str): return {"resources": resources, "next_page_token": token} -def maybe_unstring(val): +def coerce_to_float_if_possible(val): + r""" + Converts the specified value into a floating-point number if possible; + catching (rather than raising) the `ValueError` if not possible. + """ try: return float(val) except ValueError: @@ -152,10 +155,26 @@ def maybe_unstring(val): def comma_separated_values(s: str): - return [v.strip() for v in re.split(r"\s*,\s*", s)] + r""" + Returns a list of the comma-delimited substrings of the specified string. Discards any whitespace + surrounding each substring. + + Reference: https://docs.python.org/3/library/stdtypes.html#str.split + + >>> comma_separated_values("apple, banana, cherry") + ['apple', 'banana', 'cherry'] + """ + return [v.strip() for v in s.split(",")] def get_mongo_filter(filter_str): + r""" + Convert a str in the domain-specific language (DSL) solicited by `nmdc_runtime.api.models.util.FindRequest.filter` + -- i.e., a comma-separated list of `attribute:value` pairs, where the `value` can include a comparison operator + (e.g. `>=`) and where if the attribute is of type _string_ and has the suffix `.search` appended to its name + then the server should perform a full-text search + -- to a corresponding MongoDB filter representation for e.g. passing to a collection `find` call. 
+ """ filter_ = {} if not filter_str: return filter_ @@ -174,7 +193,7 @@ def get_mongo_filter(filter_str): else: for op, key in {("<", "$lt"), ("<=", "$lte"), (">", "$gt"), (">=", "$gte")}: if spec.startswith(op): - filter_[attr] = {key: maybe_unstring(spec[len(op) :])} + filter_[attr] = {key: coerce_to_float_if_possible(spec[len(op) :])} break else: filter_[attr] = spec @@ -182,6 +201,11 @@ def get_mongo_filter(filter_str): def get_mongo_sort(sort_str) -> Optional[List[Tuple[str, int]]]: + """ + Parse `sort_str` as a str of the form "attribute:spec[,attribute:spec]*", + where spec is `asc` (ascending -- the default if no spec) or `desc` (descending), + and return a value suitable to pass as a `sort` kwarg to a mongo collection `find` call. + """ sort_ = [] if not sort_str: return None @@ -209,7 +233,10 @@ def get_mongo_sort(sort_str) -> Optional[List[Tuple[str, int]]]: return sort_ -def strip_oid(doc): +def strip_oid(doc: dict) -> dict: + r""" + Returns a copy of the specified dictionary that has no `_id` key. + """ return dissoc(doc, "_id") @@ -222,6 +249,9 @@ def timeit(cursor): def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str): + r""" + TODO: Document this function. + """ if req.group_by: raise HTTPException( status_code=status.HTTP_418_IM_A_TEAPOT, @@ -347,6 +377,9 @@ def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str): def find_resources_spanning( req: FindRequest, mdb: MongoDatabase, collection_names: Set[str] ): + r""" + TODO: Document this function. + """ if req.cursor or not req.page: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, @@ -386,10 +419,16 @@ def find_resources_spanning( def exists(collection: MongoCollection, filter_: dict): + r""" + Returns whether at least one document in the collection matches the filter. + """ return collection.count_documents(filter_) > 0 def find_for(resource: str, req: FindRequest, mdb: MongoDatabase): + r""" + TODO: Document this function. 
+ """ if resource == "biosamples": return find_resources(req, mdb, "biosample_set") elif resource == "studies": @@ -408,29 +447,6 @@ def find_for(resource: str, req: FindRequest, mdb: MongoDatabase): ) -def pipeline_find_resources(req: PipelineFindRequest, mdb: MongoDatabase): - description = req.description - components = [c.strip() for c in re.split(r"\s*\n\s*\n\s*", req.pipeline_spec)] - print(components) - for c in components: - if c.startswith("/"): - parse_result = urlparse(c) - resource = parse_result.path[1:] - request_params_dict = { - p: v[0] for p, v in parse_qs(parse_result.query).items() - } - req = FindRequest(**request_params_dict) - resp = FindResponse(**find_for(resource, req, mdb)) - break - components = [ - "NOTE: This method is yet to be implemented! Only the first stage is run!" - ] + components - return PipelineFindResponse( - meta=merge(resp.meta, {"description": description, "components": components}), - results=resp.results, - ) - - def persist_content_and_get_drs_object( content: str, description: str, @@ -440,6 +456,9 @@ def persist_content_and_get_drs_object( id_ns="json-metadata-in", exists_ok=False, ): + r""" + TODO: Document this function. + """ mdb = get_mongo_db() drs_id = local_part(generate_one_id(mdb, ns=id_ns, shoulder="gfs0")) filename = filename or drs_id @@ -494,6 +513,9 @@ def _create_object( self_uri, exists_ok=False, ): + r""" + TODO: Document this function. + """ drs_obj = DrsObject( **object_in.model_dump(exclude_unset=True), id=drs_id, @@ -530,6 +552,9 @@ def _create_object( def _claim_job(job_id: str, mdb: MongoDatabase, site: Site): + r""" + TODO: Document this function. + """ job_doc = raise404_if_none(mdb.jobs.find_one({"id": job_id})) job = Job(**job_doc) # check that site satisfies the job's workflow's required capabilities. 
@@ -586,6 +611,9 @@ def _claim_job(job_id: str, mdb: MongoDatabase, site: Site): @lru_cache def nmdc_workflow_id_to_dagster_job_name_map(): + r""" + TODO: Document this function and change its name to a verb. + """ return { "metadata-in-1.0.0": "apply_metadata_in", "export-study-biosamples-as-csv-1.0.0": "export_study_biosamples_metadata", @@ -600,6 +628,9 @@ def ensure_run_config_data( mdb: MongoDatabase, user: User, ): + r""" + TODO: Document this function and say what it "ensures" about the "run config data". + """ if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0": run_config_data = assoc_in( run_config_data, @@ -629,6 +660,9 @@ def ensure_run_config_data( def inputs_for(nmdc_workflow_id, run_config_data): + r""" + TODO: Document this function. + """ if nmdc_workflow_id == "metadata-in-1.0.0": return [ "/objects/" @@ -661,6 +695,9 @@ def _request_dagster_run( repository_location_name=None, repository_name=None, ): + r""" + TODO: Document this function. + """ dagster_job_name = nmdc_workflow_id_to_dagster_job_name_map()[nmdc_workflow_id] extra_run_config_data = ensure_run_config_data( @@ -707,6 +744,9 @@ def _request_dagster_run( def _get_dagster_run_status(run_id: str): + r""" + TODO: Document this function. + """ dagster_client = get_dagster_graphql_client() try: run_status: DagsterRunStatus = dagster_client.get_run_status(run_id) @@ -716,6 +756,9 @@ def _get_dagster_run_status(run_id: str): def permitted(username: str, action: str): + r""" + TODO: Document this function and change its name to a verb. + """ db: MongoDatabase = get_mongo_db() filter_ = {"username": username, "action": action} denied = db["_runtime.api.deny"].find_one(filter_) is not None @@ -724,5 +767,8 @@ def permitted(username: str, action: str): def users_allowed(action: str): + r""" + TODO: Document this function and change its name to a verb. 
+ """ db: MongoDatabase = get_mongo_db() return db["_runtime.api.allow"].distinct("username", {"action": action}) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 67c6379c..36b902eb 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -235,49 +235,17 @@ parameters for the __metadata__ endpoints that do not have a red ___* required___ next to them are optional.
Unlike the compact syntax used in the __find__ endpoints, the syntax for the filter parameter of the metadata endpoints -uses [MongoDB-like language querying](https://www.mongodb.com/docs/manual/tutorial/query-documents/). -The applicable parameters of the __metadata__ endpoints, with acceptable syntax and examples, are in the table below. - -| Parameter | Description | Syntax | Example | -| :---: | :-----------: | :-------: | :---: | -| collection_name | The name of the collection to be queried. For a list of collection names please see the [Database class](https://w3id.org/nmdc/Database/) of the NMDC Schema | String | `biosample_set` | -| filter | Allows conditions to be set as part of the query, returning only results that satisfy the conditions | [MongoDB-like query language](https://www.mongodb.com/docs/manual/tutorial/query-documents/). All strings should be in double quotation marks. | `{"lat_lon.latitude": {"$gt": 45.0}, "ecosystem_category": "Plants"}` | -| max_page_size | Specifies the maximum number of documents returned at a time | Integer | `25` -| page_token | Specifies the token of the page to return. If unspecified, the first page is returned. To retrieve a subsequent page, the value received as the `next_page_token` from the bottom of the previous results can be provided as a `page_token`. | String | `nmdc:sys0ae1sh583` -| projection | Indicates the desired attributes to be included in the response. Helpful for trimming down the returned results | Comma-separated list of attributes that belong to the documents in the collection being queried | `name, ecosystem_type` | -| doc_id | The unique identifier of the item being requested. For example, the identifier of a biosample or an extraction | Curie e.g. `prefix:identifier` | `nmdc:bsm-11-ha3vfb58` |
-
- +uses a [MongoDB-like query language](https://www.mongodb.com/docs/manual/tutorial/query-documents/). """, }, { "name": "find", "description": """ -The [find endpoints](https://api.microbiomedata.org/docs#/find:~:text=Find%20NMDC-,metadata,-entities.) are provided with NMDC metadata entities already specified - where metadata about [studies](https://w3id.org/nmdc/Study), [biosamples](https://w3id.org/nmdc/Biosample), [data objects](https://w3id.org/nmdc/DataObject/), and [planned processes](https://w3id.org/nmdc/PlannedProcess/) can be retrieved using GET requests. +The [find endpoints](https://api.microbiomedata.org/docs#/find) are provided with NMDC metadata entities already specified - where metadata about [studies](https://w3id.org/nmdc/Study), [biosamples](https://w3id.org/nmdc/Biosample), [data objects](https://w3id.org/nmdc/DataObject/), and [planned processes](https://w3id.org/nmdc/PlannedProcess/) can be retrieved using GET requests.
Each endpoint is unique and requires the applicable attribute names to be known in order to structure a query in a meaningful way. -Please note that endpoints with parameters that do not have a red ___* required___ label next to them are optional.
- -The applicable parameters of the ___find___ endpoints, with acceptable syntax and examples, are in the table below. - -| Parameter | Description | Syntax | Example | -| :---: | :-----------: | :-------: | :---: | -| filter | Allows conditions to be set as part of the query, returning only results that satisfy the conditions | Comma separated string of attribute:value pairs. Can include comparison operators like >=, <=, <, and >. May use a `.search` after the attribute name to conduct a full text search of the field that are of type string. e.g. `attribute:value,attribute.search:value` | `ecosystem_category:Plants, lat_lon.latitude:>35.0` | -| search | Not yet implemented | Coming Soon | Not yet implemented | -| sort | Specifies the order in which the query returns the matching documents | Comma separated string of attribute:value pairs, where the value can be empty, `asc`, or `desc` (for ascending or descending order) e.g. `attribute` or `attribute:asc` or `attribute:desc`| `depth.has_numeric_value:desc, ecosystem_type` | -| page | Specifies the desired page number among the paginated results | Integer | `3` | -| per_page | Specifies the number of results returned per page. Maximum allowed is 2,000 | Integer | `50` | -| cursor | A bookmark for where a query can pick up where it has left off. To use cursor paging, set the `cursor` parameter to `*`. The results will include a `next_cursor` value in the response's `meta` object that can be used in the `cursor` parameter to retrieve the subsequent results ![next_cursor](../_static/images/howto_guides/api_gui/find_cursor.png) | String | `*` or `nmdc:sys0zr0fbt71` | -| group_by | Not yet implemented | Coming Soon | Not yet implemented | -| fields | Indicates the desired attributes to be included in the response. 
Helpful for trimming down the returned results | Comma-separated list of attributes that belong to the documents in the collection being queried | `name, ess_dive_datasets` | -| study_id | The unique identifier of a study | Curie e.g. `prefix:identifier` | `nmdc:sty-11-34xj1150` | -| sample_id | The unique identifier of a biosample | Curie e.g. `prefix:identifier` | `nmdc:bsm-11-w43vsm21` | -| data_object_id | The unique identifier of a data object | Curie e.g. `prefix:identifier` | `nmdc:dobj-11-7c6np651` | -| planned_process_id | The unique identifier for an NMDC planned process | Curie e.g. `prefix:identifier` | `nmdc:wfmgan-11-hvcnga50.1`| - -
- +Parameters that do not have a red ___* required___ label next to them are optional. """, }, { diff --git a/nmdc_runtime/api/models/util.py b/nmdc_runtime/api/models/util.py index ee277c73..bd73271d 100644 --- a/nmdc_runtime/api/models/util.py +++ b/nmdc_runtime/api/models/util.py @@ -1,10 +1,13 @@ from typing import TypeVar, List, Optional, Generic, Annotated -from fastapi import Query - from pydantic import model_validator, Field, BaseModel from typing_extensions import Annotated +# TODO: Document rationale for importing `Annotated` from one package versus the other, and remove the obsolete import. +# As of Python 3.9, I think there is no difference. That's because the `typing_extensions` documentation says the +# following about its `Annotated` thing: "See typing.Annotated and PEP 593. In typing since 3.9." +# Reference: https://typing-extensions.readthedocs.io/en/stable/#typing_extensions.Annotated + ResultT = TypeVar("ResultT") @@ -14,46 +17,154 @@ class ListResponse(BaseModel, Generic[ResultT]): class ListRequest(BaseModel): - filter: Annotated[ - Optional[str], - Query( - description='MongoDB-style JSON filter document. Example: `{"ecosystem_type": "Freshwater"}`' - ), - ] = None - max_page_size: Optional[int] = 20 - page_token: Optional[str] = None - projection: Annotated[ - Optional[str], - Query( - description=( - "for MongoDB-like " - "[projection](https://www.mongodb.com/docs/manual/tutorial/project-fields-from-query-results/): " - "comma-separated list of fields you want the objects in the response to include. " - "Note: `id` will always be included. " - "Example: `ecosystem_type,name`" - ) - ), - ] = None + r""" + An encapsulation of a set of parameters accepted by API endpoints related to listing things. + + Note: This class was documented after the `FindRequest` class was documented. You can refer to the documentation of + the latter class for additional context about the usage of Pydantic's `Field` constructor in this class. 
+ """ + + filter: Optional[str] = Field( + default=None, + title="Filter", + description="""The criteria by which you want to filter the resources, in the same format as the [`query` + parameter](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query) + of MongoDB's `db.collection.find()` method.\n\n_Example:_ + `{"lat_lon.latitude": {"$gt": 45.0}, "ecosystem_category": "Plants"}`""", + examples=[ + r'{"ecosystem_type": "Freshwater"}', + r'{"lat_lon.latitude": {"$gt": 45.0}, "ecosystem_category": "Plants"}', + ], + ) + # TODO: Document why the optional type here is `int` as opposed to `PerPageRange` (`FindRequest` uses the latter). + max_page_size: Optional[int] = Field( + default=20, + title="Resources per page", + description="How many resources you want _each page_ to contain, formatted as a positive integer.", + examples=[20], + ) + page_token: Optional[str] = Field( + default=None, + title="Next page token", + description="""A bookmark you can use to fetch the _next_ page of resources. You can get this from the + `next_page_token` field in a previous response from this endpoint.\n\n_Example_: + `nmdc:sys0zr0fbt71`""", + examples=[ + "nmdc:sys0zr0fbt71", + ], + ) + # TODO: Document the endpoint's behavior when a projection includes a _nested_ field identifier (i.e. `foo.bar`), + # and ensure the endpoint doesn't break when the projection includes field descriptors that contain commas. + projection: Optional[str] = Field( + default=None, + title="Projection", + description="""Comma-delimited list of the names of the fields you want the resources in the response to + include. 
Note: In addition to those fields, the response will also include the `id` + field.\n\n_Example_: `name, ecosystem_type`""", + examples=[ + "name, ecosystem_type", + ], + ) PerPageRange = Annotated[int, Field(gt=0, le=2_000)] class FindRequest(BaseModel): - filter: Optional[str] = None - search: Optional[str] = None - sort: Optional[str] = None - page: Optional[int] = None - per_page: Optional[PerPageRange] = 25 - cursor: Optional[str] = None - group_by: Optional[str] = None - fields: Annotated[ - Optional[str], - Query( - description="comma-separated list of fields you want the objects in the response to include" - ), - ] = None - + r""" + An encapsulation of a set of parameters accepted by API endpoints related to finding things. + + Notes: + - The "Query Parameter Models" section of the FastAPI docs says that this way of encapsulating + a set of query parameter definitions in a Pydantic model — so that Swagger UI displays a given + parameter's _description_ — was introduced in FastAPI 0.115.0. + Reference: https://fastapi.tiangolo.com/tutorial/query-param-models/ + - While Swagger UI does show the parameter's _description_, specifically, it does not currently show the + parameter's _title_ or example value(s). The approach shown in the "Classes as Dependencies" section + of the FastAPI docs (i.e. https://fastapi.tiangolo.com/tutorial/dependencies/classes-as-dependencies/) + does result in Swagger UI showing those additional things, but the approach involves not inheriting + from Pydantic's `BaseModel` class and involves defining an `__init__` method for the class. That is + further than I want to take these classes from their existing selves at this point. To compensate + for that, I have included examples _within_ some of the descriptions. + Reference: https://github.com/fastapi/fastapi/issues/318#issuecomment-507043221 + - The "Fields" section of the Pydantic docs says: + > "The `Field` function is used to customize and add metadata to fields of models." 
+ References: https://docs.pydantic.dev/latest/concepts/fields/ + """ + + filter: Optional[str] = Field( + default=None, + title="Filter", + description="""The criteria by which you want to filter the resources, formatted as a comma-separated list of + `attribute:value` pairs. The `value` can include a comparison operator (e.g. `>=`). If the attribute + is of type _string_ and you append `.search` to its name, the server will perform a full-text + search.\n\n_Example:_ `ecosystem_category:Plants, lat_lon.latitude:>35.0`""", + examples=[ + "ecosystem_category:Plants", + "ecosystem_category:Plants, lat_lon.latitude:>35.0", + ], + ) + search: Optional[str] = Field( + default=None, + title="Search", + description="N/A _(not implemented yet)_", + ) + sort: Optional[str] = Field( + default=None, + title="Sort", + description="""How you want the resources to be ordered in the response, formatted as a comma-separated list of + `attribute:value` pairs. Each `attribute` is the name of a field you want the resources to be + ordered by, and each `value` is the direction you want the values in that field to be ordered + (i.e. `asc` or no value for _ascending_ order, and `desc` for _descending_ order).\n\n_Example:_ + `depth.has_numeric_value:desc, ecosystem_type`""", + examples=[ + "depth.has_numeric_value:desc", + "depth.has_numeric_value:desc, ecosystem_type", + ], + ) + page: Optional[int] = Field( + default=None, + title="Page number", + description="""_Which page_ of resources you want to retrieve, when using page number-based pagination. 
+ This is the page number formatted as an integer ≥ 1.""", + examples=[1], + ) + per_page: Optional[PerPageRange] = Field( + default=25, + title="Resources per page", + description="How many resources you want _each page_ to contain, formatted as a positive integer ≤ 2000.", + examples=[25], + ) + cursor: Optional[str] = Field( + default=None, + title="Cursor", + description="""A bookmark you can use to fetch the _next_ page of resources, when using cursor-based pagination. + To use cursor-based pagination, set the `cursor` parameter to `*`. The response's `meta` object will + include a `next_cursor` field, whose value can be used as the `cursor` parameter in a subsequent + request.\n\n_Example_: `nmdc:sys0zr0fbt71`""", + examples=[ + "*", + "nmdc:sys0zr0fbt71", + ], + ) + group_by: Optional[str] = Field( + default=None, + title="Group by", + description="N/A _(not implemented yet)_", + ) + fields: Optional[str] = Field( + default=None, + title="Fields", + description="""The fields you want the resources to include in the response, formatted as a comma-separated list + of field names. This can be used to reduce the size and complexity of the response.\n\n_Example:_ + `name, ess_dive_datasets`""", + examples=[ + "name", + "name, ess_dive_datasets", + ], + ) + + # Reference: https://docs.pydantic.dev/latest/concepts/validators/#model-validators @model_validator(mode="before") def set_page_if_cursor_unset(cls, values): page, cursor = values.get("page"), values.get("cursor") @@ -64,22 +175,12 @@ def set_page_if_cursor_unset(cls, values): return values -class PipelineFindRequest(BaseModel): - pipeline_spec: str - description: str - - class FindResponse(BaseModel): meta: dict results: List[dict] group_by: List[dict] -class PipelineFindResponse(BaseModel): - meta: dict - results: List[dict] - - # Note: For MongoDB, a single collection can have no more than 64 indexes # Note: Each collection has a unique index set on "id" elsewhere. 
entity_attributes_to_index = { diff --git a/nmdc_runtime/templates/pipeline_search.html b/nmdc_runtime/templates/pipeline_search.html deleted file mode 100644 index d12424a3..00000000 --- a/nmdc_runtime/templates/pipeline_search.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - NMDC Runtime Search API - - - -

NMDC Runtime Pipeline-Search Form

-

Examples below

- - -
- - - -
- -

Examples

-

- - /biosamples?depth.has_numeric_value:>100,depth.has_unit:meter - - | part_of->id - - /studies - -

- - \ No newline at end of file diff --git a/requirements/main.in b/requirements/main.in index dcdc49c5..9758b641 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -9,7 +9,7 @@ dagster-graphql dagster-postgres dependency-injector dotted-dict -fastapi>=0.104.1 # Pins Swagger UI version to 5.9.0 temporarily to handle a bug crashing it in 5.9.1 +fastapi>=0.115.0 # note: FastAPI 0.115.0 introduced support for encapsulating request _query_ parameters in Pydantic models, including Swagger annotations fastjsonschema fnc frozendict