From 21393c5815fd29abfcefd8eb99d174e6317ad0e0 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 2 Nov 2023 16:01:58 -0400 Subject: [PATCH 01/28] fix: de-duplicate metadata submission after one minute (#347) re-submission of "same" changes is a valid use case closes #340 --- nmdc_runtime/api/endpoints/util.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index d1c233b3..04d16b03 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -2,12 +2,14 @@ import os import re import tempfile +from datetime import datetime from functools import lru_cache from json import JSONDecodeError from pathlib import Path from time import time_ns from typing import List, Optional, Set, Tuple from urllib.parse import parse_qs, urlparse +from zoneinfo import ZoneInfo from bson import json_util from dagster import DagsterRunStatus @@ -433,7 +435,11 @@ def persist_content_and_get_drs_object( **drs_metadata_for( filepath, base={ - "description": description + f" (created by/for {username})", + "description": ( + description + + f" (created by/for {username}" + + f" at {datetime.now(tz=ZoneInfo('America/Los_Angeles')).isoformat(timespec='minutes')})" + ), "access_methods": [{"access_id": drs_id}], }, ) From 2c2a4dd7d55714761253cdde4cb9a9ef5ba823ba Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 2 Nov 2023 16:03:41 -0400 Subject: [PATCH 02/28] fix: ensure pydantic models serialize to json-compatible `dict`s. (#346) add regression test closes #345 --- .../workflow_execution_activity/core.py | 4 +- nmdc_runtime/api/core/util.py | 2 +- nmdc_runtime/api/endpoints/objects.py | 6 +- nmdc_runtime/api/endpoints/operations.py | 8 ++- nmdc_runtime/api/endpoints/queries.py | 10 +-- nmdc_runtime/api/endpoints/runs.py | 6 +- nmdc_runtime/api/endpoints/search.py | 4 +- nmdc_runtime/api/endpoints/sites.py | 12 +++- nmdc_runtime/api/endpoints/users.py | 22 +++++-- nmdc_runtime/api/endpoints/util.py | 20 ++++-- nmdc_runtime/api/main.py | 10 ++- nmdc_runtime/api/models/run.py | 18 +++-- nmdc_runtime/core/exceptions/token.py | 2 +- nmdc_runtime/minter/adapters/repository.py | 13 +++- .../minter/entrypoints/fastapi_app.py | 8 ++- nmdc_runtime/site/drsobjects/ingest.py | 6 +- nmdc_runtime/site/ops.py | 15 ++++- nmdc_runtime/site/repository.py | 4 +- nmdc_runtime/site/resources.py | 4 +- tests/files/nmdc_bsm-12-7mysck21.json | 48 ++++++++++++++ tests/files/nmdc_sty-11-pzmd0x14.json | 65 +++++++++++++++++++ tests/integration/test_minter_repository.py | 50 ++++++++++++-- tests/test_api/test_endpoints.py | 58 +++++++++++++++-- tests/test_api/test_metadata.py | 4 ++ util/mongorestore-nmdc.sh | 2 +- 25 files changed, 346 insertions(+), 55 deletions(-) create mode 100644 tests/files/nmdc_bsm-12-7mysck21.json create mode 100644 tests/files/nmdc_sty-11-pzmd0x14.json diff --git a/components/nmdc_runtime/workflow_execution_activity/core.py b/components/nmdc_runtime/workflow_execution_activity/core.py index 8a550e80..ad7d020d 100644 --- a/components/nmdc_runtime/workflow_execution_activity/core.py +++ b/components/nmdc_runtime/workflow_execution_activity/core.py @@ -94,7 +94,9 @@ def insert_into_keys( workflow: Workflow, data_objects: list[DataObject] ) -> dict[str, Any]: """Insert data object url into correct workflow input field.""" - workflow_dict = workflow.dict() + workflow_dict = workflow.model_dump( + mode="json", + ) for key in workflow_dict["inputs"]: for do in data_objects: if 
workflow_dict["inputs"][key] == str(do.data_object_type): diff --git a/nmdc_runtime/api/core/util.py b/nmdc_runtime/api/core/util.py index 5aad716d..48d15ed1 100644 --- a/nmdc_runtime/api/core/util.py +++ b/nmdc_runtime/api/core/util.py @@ -98,6 +98,6 @@ def generate_secret(length=12): def json_clean(data, model, exclude_unset=False) -> dict: """Run data through a JSON serializer for a pydantic model.""" if not isinstance(data, (dict, BaseModel)): - raise TypeError("`data` must be a pydantic model or its .dict()") + raise TypeError("`data` must be a pydantic model or its .model_dump()") m = model(**data) if isinstance(data, dict) else data return json.loads(m.json(exclude_unset=exclude_unset)) diff --git a/nmdc_runtime/api/endpoints/objects.py b/nmdc_runtime/api/endpoints/objects.py index 13cfa3aa..0dd3b443 100644 --- a/nmdc_runtime/api/endpoints/objects.py +++ b/nmdc_runtime/api/endpoints/objects.py @@ -78,7 +78,7 @@ def create_object( """ id_supplied = supplied_object_id( - mdb, client_site, object_in.dict(exclude_unset=True) + mdb, client_site, object_in.model_dump(mode="json", exclude_unset=True) ) drs_id = local_part( id_supplied if id_supplied is not None else generate_one_id(mdb, S3_ID_NS) @@ -255,7 +255,9 @@ def update_object( status_code=status.HTTP_403_FORBIDDEN, detail=f"client authorized for different site_id than {object_mgr_site}", ) - doc_object_patched = merge(doc, object_patch.dict(exclude_unset=True)) + doc_object_patched = merge( + doc, object_patch.model_dump(mode="json", exclude_unset=True) + ) mdb.operations.replace_one({"id": object_id}, doc_object_patched) return doc_object_patched diff --git a/nmdc_runtime/api/endpoints/operations.py b/nmdc_runtime/api/endpoints/operations.py index 1f1f67b3..ecb4d33e 100644 --- a/nmdc_runtime/api/endpoints/operations.py +++ b/nmdc_runtime/api/endpoints/operations.py @@ -61,12 +61,16 @@ def update_operation( detail=f"client authorized for different site_id than {site_id_op}", ) op_patch_metadata = merge( - op_patch.dict(exclude_unset=True).get("metadata", {}), + op_patch.model_dump(mode="json", exclude_unset=True).get("metadata", {}), pick(["site_id", "job", "model"], doc_op.get("metadata", {})), ) doc_op_patched = merge( doc_op, - assoc(op_patch.dict(exclude_unset=True), "metadata", op_patch_metadata), + assoc( + op_patch.model_dump(mode="json", exclude_unset=True), + "metadata", + op_patch_metadata, + ), ) mdb.operations.replace_one({"id": op_id}, doc_op_patched) return doc_op_patched diff --git a/nmdc_runtime/api/endpoints/queries.py b/nmdc_runtime/api/endpoints/queries.py index 4e1c49c9..11417698 100644 --- a/nmdc_runtime/api/endpoints/queries.py +++ b/nmdc_runtime/api/endpoints/queries.py @@ -75,9 +75,9 @@ def run_query( id=qid, saved_at=saved_at, ) - mdb.queries.insert_one(query.dict(exclude_unset=True)) + mdb.queries.insert_one(query.model_dump(mode="json", exclude_unset=True)) cmd_response = _run_query(query, mdb) - return unmongo(cmd_response.dict(exclude_unset=True)) + return unmongo(cmd_response.model_dump(mode="json", exclude_unset=True)) @router.get("/queries/{query_id}", response_model=Query) @@ -107,7 +107,7 @@ def rerun_query( check_can_delete(user) cmd_response = _run_query(query, mdb) - return unmongo(cmd_response.dict(exclude_unset=True)) + return unmongo(cmd_response.model_dump(mode="json", exclude_unset=True)) def _run_query(query, mdb) -> CommandResponse: @@ -131,12 +131,12 @@ def _run_query(query, mdb) -> CommandResponse: detail="Failed to back up to-be-deleted documents. 
operation aborted.", ) - q_response = mdb.command(query.cmd.dict(exclude_unset=True)) + q_response = mdb.command(query.cmd.model_dump(mode="json", exclude_unset=True)) cmd_response: CommandResponse = command_response_for(q_type)(**q_response) query_run = ( QueryRun(qid=query.id, ran_at=ran_at, result=cmd_response) if cmd_response.ok else QueryRun(qid=query.id, ran_at=ran_at, error=cmd_response) ) - mdb.query_runs.insert_one(query_run.dict(exclude_unset=True)) + mdb.query_runs.insert_one(query_run.model_dump(mode="json", exclude_unset=True)) return cmd_response diff --git a/nmdc_runtime/api/endpoints/runs.py b/nmdc_runtime/api/endpoints/runs.py index c49b7800..8bd9f22d 100644 --- a/nmdc_runtime/api/endpoints/runs.py +++ b/nmdc_runtime/api/endpoints/runs.py @@ -94,5 +94,9 @@ def post_run_event( status_code=status.HTTP_400_BAD_REQUEST, detail=f"Supplied run_event.run.id does not match run_id given in request URL.", ) - mdb.run_events.insert_one(run_event.dict()) + mdb.run_events.insert_one( + run_event.model_dump( + mode="json", + ) + ) return _get_run_summary(run_event.run.id, mdb) diff --git a/nmdc_runtime/api/endpoints/search.py b/nmdc_runtime/api/endpoints/search.py index 5fe80d2c..4813f7d2 100644 --- a/nmdc_runtime/api/endpoints/search.py +++ b/nmdc_runtime/api/endpoints/search.py @@ -25,7 +25,9 @@ def data_objects( req: DataObjectListRequest = Depends(), mdb: MongoDatabase = Depends(get_mongo_db), ): - filter_ = list_request_filter_to_mongo_filter(req.dict(exclude_unset=True)) + filter_ = list_request_filter_to_mongo_filter( + req.model_dump(mode="json", exclude_unset=True) + ) max_page_size = filter_.pop("max_page_size", None) page_token = filter_.pop("page_token", None) req = ListRequest( diff --git a/nmdc_runtime/api/endpoints/sites.py b/nmdc_runtime/api/endpoints/sites.py index f63fd993..76adfdc1 100644 --- a/nmdc_runtime/api/endpoints/sites.py +++ b/nmdc_runtime/api/endpoints/sites.py @@ -56,7 +56,11 @@ def create_site( status_code=status.HTTP_409_CONFLICT, detail=f"site with supplied id {site.id} already exists", ) - mdb.sites.insert_one(site.dict()) + mdb.sites.insert_one( + site.model_dump( + mode="json", + ) + ) refresh_minter_requesters_from_sites() rv = mdb.users.update_one( {"username": user.username}, @@ -165,7 +169,11 @@ def put_object_in_site( }, } ) - mdb.operations.insert_one(op.dict()) + mdb.operations.insert_one( + op.model_dump( + mode="json", + ) + ) return op diff --git a/nmdc_runtime/api/endpoints/users.py b/nmdc_runtime/api/endpoints/users.py index 601a5be8..587ad453 100644 --- a/nmdc_runtime/api/endpoints/users.py +++ b/nmdc_runtime/api/endpoints/users.py @@ -35,7 +35,11 @@ async def login_for_access_token( detail="Incorrect username or password", headers={"WWW-Authenticate": "Bearer"}, ) - access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.dict()) + access_token_expires = timedelta( + **ACCESS_TOKEN_EXPIRES.model_dump( + mode="json", + ) + ) access_token = create_access_token( data={"sub": f"user:{user.username}"}, expires_delta=access_token_expires ) @@ -50,7 +54,11 @@ async def login_for_access_token( headers={"WWW-Authenticate": "Bearer"}, ) # TODO make below an absolute time - access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.dict()) + access_token_expires = timedelta( + **ACCESS_TOKEN_EXPIRES.model_dump( + mode="json", + ) + ) access_token = create_access_token( data={"sub": f"client:{form_data.client_id}"}, expires_delta=access_token_expires, @@ -58,7 +66,9 @@ async def login_for_access_token( return { "access_token": access_token, 
"token_type": "bearer", - "expires": ACCESS_TOKEN_EXPIRES.dict(), + "expires": ACCESS_TOKEN_EXPIRES.model_dump( + mode="json", + ), } @@ -84,8 +94,10 @@ def create_user( check_can_create_user(requester) mdb.users.insert_one( UserInDB( - **user_in.dict(), + **user_in.model_dump( + mode="json", + ), hashed_password=get_password_hash(user_in.password), - ).dict(exclude_unset=True) + ).model_dump(mode="json", exclude_unset=True) ) return mdb.users.find_one({"username": user_in.username}) diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index 04d16b03..9b228bda 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -454,9 +454,11 @@ def _create_object( mdb: MongoDatabase, object_in: DrsObjectIn, mgr_site, drs_id, self_uri ): drs_obj = DrsObject( - **object_in.dict(exclude_unset=True), id=drs_id, self_uri=self_uri + **object_in.model_dump(exclude_unset=True, mode="json"), + id=drs_id, + self_uri=self_uri, ) - doc = drs_obj.dict(exclude_unset=True) + doc = drs_obj.model_dump(exclude_unset=True, mode="json") doc["_mgr_site"] = mgr_site # manager site try: mdb.objects.insert_one(doc) @@ -517,16 +519,22 @@ def _claim_job(job_id: str, mdb: MongoDatabase, site: Site): "workflow": job.workflow, "config": job.config, } - ).dict(exclude_unset=True), + ).model_dump(mode="json", exclude_unset=True), "site_id": site.id, "model": dotted_path_for(JobOperationMetadata), }, } ) - mdb.operations.insert_one(op.dict()) - mdb.jobs.replace_one({"id": job.id}, job.dict(exclude_unset=True)) + mdb.operations.insert_one( + op.model_dump( + mode="json", + ) + ) + mdb.jobs.replace_one( + {"id": job.id}, job.model_dump(mode="json", exclude_unset=True) + ) - return op.dict(exclude_unset=True) + return op.model_dump(mode="json", exclude_unset=True) @lru_cache diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 5f4d799a..ca10adcc 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -232,7 +232,9 @@ def ensure_initial_resources_on_boot(): collection_boot = import_module(f"nmdc_runtime.api.boot.{collection_name}") for model in collection_boot.construct(): - doc = model.dict() + doc = model.model_dump( + mode="json", + ) mdb[collection_name].replace_one({"id": doc["id"]}, doc, upsert=True) username = os.getenv("API_ADMIN_USER") @@ -244,7 +246,7 @@ def ensure_initial_resources_on_boot(): username=username, hashed_password=get_password_hash(os.getenv("API_ADMIN_PASS")), site_admin=[os.getenv("API_SITE_ID")], - ).dict(exclude_unset=True), + ).model_dump(mode="json", exclude_unset=True), upsert=True, ) mdb.users.create_index("username") @@ -265,7 +267,9 @@ def ensure_initial_resources_on_boot(): ), ) ], - ).dict(), + ).model_dump( + mode="json", + ), upsert=True, ) diff --git a/nmdc_runtime/api/models/run.py b/nmdc_runtime/api/models/run.py index 49fa37ba..43bf734e 100644 --- a/nmdc_runtime/api/models/run.py +++ b/nmdc_runtime/api/models/run.py @@ -93,7 +93,11 @@ def _add_run_requested_event(run_spec: RunUserSpec, mdb: MongoDatabase, user: Us time=now(as_str=True), inputs=run_spec.inputs, ) - mdb.run_events.insert_one(event.dict()) + mdb.run_events.insert_one( + event.model_dump( + mode="json", + ) + ) return run_id @@ -113,7 +117,9 @@ def _add_run_started_event(run_id: str, mdb: MongoDatabase): job=requested.job, type=RunEventType.STARTED, time=now(as_str=True), - ).dict() + ).model_dump( + mode="json", + ) ) return run_id @@ -134,7 +140,9 @@ def _add_run_fail_event(run_id: str, mdb: MongoDatabase): job=requested.job, 
type=RunEventType.FAIL, time=now(as_str=True), - ).dict() + ).model_dump( + mode="json", + ) ) return run_id @@ -156,6 +164,8 @@ def _add_run_complete_event(run_id: str, mdb: MongoDatabase, outputs: List[str]) type=RunEventType.COMPLETE, time=now(as_str=True), outputs=outputs, - ).dict() + ).model_dump( + mode="json", + ) ) return run_id diff --git a/nmdc_runtime/core/exceptions/token.py b/nmdc_runtime/core/exceptions/token.py index c5e9b1c3..afe00871 100644 --- a/nmdc_runtime/core/exceptions/token.py +++ b/nmdc_runtime/core/exceptions/token.py @@ -1,4 +1,4 @@ -from core.exceptions import CustomException +from nmdc_runtime.core.exceptions import CustomException class DecodeTokenException(CustomException): diff --git a/nmdc_runtime/minter/adapters/repository.py b/nmdc_runtime/minter/adapters/repository.py index 25382731..879bdcc8 100644 --- a/nmdc_runtime/minter/adapters/repository.py +++ b/nmdc_runtime/minter/adapters/repository.py @@ -97,7 +97,9 @@ def mint(self, req_mint: MintingRequest) -> list[Identifier]: ) ) for id_ in ids: - self.db[id_.id] = id_.dict() + self.db[id_.id] = id_.model_dump( + mode="json", + ) return ids def bind(self, req_bind: BindingRequest) -> Identifier: @@ -184,7 +186,14 @@ def mint(self, req_mint: MintingRequest) -> list[Identifier]: ) for id_name in not_taken ] - self.db["minter.id_records"].insert_many([i.dict() for i in ids]) + self.db["minter.id_records"].insert_many( + [ + i.model_dump( + mode="json", + ) + for i in ids + ] + ) collected.extend(ids) if len(collected) == req_mint.how_many: break diff --git a/nmdc_runtime/minter/entrypoints/fastapi_app.py b/nmdc_runtime/minter/entrypoints/fastapi_app.py index d0ac097f..3d4b7efc 100644 --- a/nmdc_runtime/minter/entrypoints/fastapi_app.py +++ b/nmdc_runtime/minter/entrypoints/fastapi_app.py @@ -37,7 +37,13 @@ def mint_ids( requester = Entity(id=site.id) try: minted = s.mint( - MintingRequest(service=service, requester=requester, **req_mint.dict()) + MintingRequest( + service=service, + requester=requester, + **req_mint.model_dump( + mode="json", + ), + ) ) return [d.id for d in minted] except MinterError as e: diff --git a/nmdc_runtime/site/drsobjects/ingest.py b/nmdc_runtime/site/drsobjects/ingest.py index b4b7dc38..26a26f43 100644 --- a/nmdc_runtime/site/drsobjects/ingest.py +++ b/nmdc_runtime/site/drsobjects/ingest.py @@ -44,7 +44,11 @@ def claim_metadata_ingest_jobs( ) jobs = [] while True: - rv = client.list_jobs(lr.dict()).json() + rv = client.list_jobs( + lr.model_dump( + mode="json", + ) + ).json() jobs.extend(rv["resources"]) if "next_page_token" not in rv: break diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 828fe4e8..265498ab 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -267,7 +267,11 @@ def get_operation(context): def produce_curated_db(context, op: Operation): client: RuntimeApiSiteClient = context.resources.runtime_api_site_client mdb: MongoDatabase = context.resources.mongo.db - op = Operation[ResultT, JobOperationMetadata](**op.dict()) + op = Operation[ResultT, JobOperationMetadata]( + **op.model_dump( + mode="json", + ) + ) op_meta: JobOperationMetadata = op.metadata job_id = op_meta.job.id job = mdb.jobs.find_one({"id": job_id}) @@ -350,7 +354,12 @@ def filter_ops_undone_expired() -> str: @op(required_resource_keys={"runtime_api_site_client"}) def list_operations(context, filter_: str) -> list: client = context.resources.runtime_api_site_client - ops = [op.dict() for op in client.list_operations({"filter": filter_})] + ops = [ + op.model_dump( 
+ mode="json", + ) + for op in client.list_operations({"filter": filter_}) + ] context.log.info(str(len(ops))) return ops @@ -466,7 +475,7 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn): op = Operation(**mdb.operations.find_one({"id": op_id})) op.done = True op.result = {"update_cmd": json.dumps(update_cmd)} - op_doc = op.dict(exclude_unset=True) + op_doc = op.model_dump(mode="json", exclude_unset=True) mdb.operations.replace_one({"id": op_id}, op_doc) return ["/operations/" + op_doc["id"]] diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 44270249..22b9a9c0 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -405,7 +405,9 @@ def claim_and_run_apply_changesheet_jobs(_context): def done_object_put_ops(_context): client = get_runtime_api_site_client(run_config_frozen__normal_env) ops = [ - op.dict() + op.model_dump( + mode="json", + ) for op in client.list_operations( { "filter": json.dumps( diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 79cae368..983ab206 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -60,7 +60,9 @@ def request(self, method, url_path, params_or_json_data=None): self.ensure_token() kwargs = {"url": self.base_url + url_path, "headers": self.headers} if isinstance(params_or_json_data, BaseModel): - params_or_json_data = params_or_json_data.dict(exclude_unset=True) + params_or_json_data = params_or_json_data.model_dump( + mode="json", exclude_unset=True + ) if method.upper() == "GET": kwargs["params"] = params_or_json_data else: diff --git a/tests/files/nmdc_bsm-12-7mysck21.json b/tests/files/nmdc_bsm-12-7mysck21.json new file mode 100644 index 00000000..d0571f47 --- /dev/null +++ b/tests/files/nmdc_bsm-12-7mysck21.json @@ -0,0 +1,48 @@ +{ + "analysis_type": [ + "metagenomics" + ], + "biosample_categories": [ + "NEON" + ], + "collection_date": { + "has_raw_value": "2014-07-15T18:00Z" + }, + "depth": { + "has_maximum_numeric_value": 1, + "has_minimum_numeric_value": 0, + "has_unit": "meters" + }, + "elev": 1179.5, + "env_broad_scale": { + "term": { + "id": "ENVO:01000253", + "name": "freshwater river biome" + } + }, + "env_local_scale": { + "term": { + "id": "ENVO:03600095", + "name": "stream run" + } + }, + "env_medium": { + "term": { + "id": "ENVO:01001057", + "name": "environment associated with a plant part or small plant" + } + }, + "geo_loc_name": { + "has_raw_value": "USA: Colorado, Arikaree River" + }, + "id": "nmdc:bsm-12-7mysck21", + "lat_lon": { + "latitude": 39.758206, + "longitude": -102.447148 + }, + "name": "ARIK.20140715.AMC.EPIPHYTON.5", + "part_of": [ + "nmdc:sty-11-34xj1150" + ], + "type": "nmdc:Biosample" +} diff --git a/tests/files/nmdc_sty-11-pzmd0x14.json b/tests/files/nmdc_sty-11-pzmd0x14.json new file mode 100644 index 00000000..114437c0 --- /dev/null +++ b/tests/files/nmdc_sty-11-pzmd0x14.json @@ -0,0 +1,65 @@ +{ + "id": "nmdc:sty-11-pzmd0x14", + "name": "National Ecological Observatory Network: benthic metagenomes (DP1.20279.001)", + "type": "nmdc:Study", + "title": "National Ecological Observatory Network: benthic metagenomes (DP1.20279.001)", + "description": "The National Science Foundation's National Ecological Observatory Network (NEON) is a continental-scale observation facility operated by Battelle and designed to collect long-term open access ecological data to better understand how U.S. 
ecosystems are changing.", + "websites": [ + "https://www.neonscience.org/", + "https://data.neonscience.org/data-products/DP1.20279.001", + "https://data.neonscience.org/api/v0/documents/NEON_metagenomes_userGuide_vE.pdf" + ], + "funding_sources": [ + "NSF#1724433 National Ecological Observatory Network: Operations Activities" + ], + "principal_investigator": { + "name": "Kate Thibault", + "email": "kthibault@battelleecology.org", + "orcid": "orcid:0000-0003-3477-6424", + "has_raw_value": "Kate Thibault" + }, + "has_credit_associations": [ + { + "applies_to_person": { + "name": "Hugh Cross", + "email": "crossh@battelleecology.org", + "orcid": "orcid:0000-0002-6745-9479" + }, + "applied_roles": [ + "Methodology", + "Data curation" + ] + }, + { + "applies_to_person": { + "name": "Kate Thibault", + "email": "kthibault@battelleecology.org", + "orcid": "orcid:0000-0003-3477-6424" + }, + "applied_roles": [ + "Principal Investigator" + ] + }, + { + "applies_to_person": { + "name": "Stephanie Parker", + "email": "sparker@battelleecology.org", + "orcid": "0000-0002-7180-7245" + }, + "applied_roles": [ + "Methodology", + "Data curation" + ] + } + ], + "study_image": [ + { + "url": "https://portal.nersc.gov/project/m3408/profile_images/nmdc_sty-11-34xj1150.jpg" + } + ], + "gold_study_identifiers": [], + "part_of": [ + "nmdc:sty-11-nxrz9m96" + ], + "study_category": "consortium" +} diff --git a/tests/integration/test_minter_repository.py b/tests/integration/test_minter_repository.py index 96199670..45ad0b56 100644 --- a/tests/integration/test_minter_repository.py +++ b/tests/integration/test_minter_repository.py @@ -29,7 +29,12 @@ def test_mint_and_resolve(): s: InMemoryIDStore = get_test_inmemoryidstore() req_mint = minting_request() id_: Identifier = next(i for i in s.mint(req_mint)) - req_res = ResolutionRequest(id_name=id_.name, **req_mint.dict()) + req_res = ResolutionRequest( + id_name=id_.name, + **req_mint.model_dump( + mode="json", + ), + ) assert s.resolve(req_res) is not None @@ -37,9 +42,23 @@ def test_mint_and_delete(): s: InMemoryIDStore = get_test_inmemoryidstore() req_mint = minting_request() id_: Identifier = next(i for i in s.mint(req_mint)) - req_del = DeleteRequest(id_name=id_.name, **req_mint.dict()) + req_del = DeleteRequest( + id_name=id_.name, + **req_mint.model_dump( + mode="json", + ), + ) s.delete(req_del) - assert s.resolve(ResolutionRequest(**req_del.dict())) is None + assert ( + s.resolve( + ResolutionRequest( + **req_del.model_dump( + mode="json", + ) + ) + ) + is None + ) def test_mongo_mint_one(): @@ -70,7 +89,12 @@ def test_mongo_mint_and_resolve(): req_mint = minting_request() id_: Identifier = next(i for i in s.mint(req_mint)) - req_res = ResolutionRequest(id_name=id_.name, **req_mint.dict()) + req_res = ResolutionRequest( + id_name=id_.name, + **req_mint.model_dump( + mode="json", + ), + ) assert s.resolve(req_res) is not None @@ -80,7 +104,21 @@ def test_mongo_mint_and_delete(): req_mint = minting_request() id_: Identifier = next(i for i in s.mint(req_mint)) - req_del = DeleteRequest(id_name=id_.name, **req_mint.dict()) + req_del = DeleteRequest( + id_name=id_.name, + **req_mint.model_dump( + mode="json", + ), + ) s.delete(req_del) - assert s.resolve(ResolutionRequest(**req_del.dict())) is None + assert ( + s.resolve( + ResolutionRequest( + **req_del.model_dump( + mode="json", + ) + ) + ) + is None + ) assert s.db["minter.id_records"].count_documents({}) == 0 diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index 1dd677cc..387154b0 
100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -1,4 +1,6 @@ +import json import os +import re import pytest import requests @@ -7,13 +9,17 @@ from toolz import get_in from nmdc_runtime.api.core.auth import get_password_hash +from nmdc_runtime.api.core.metadata import df_from_sheet_in, _validate_changesheet from nmdc_runtime.api.core.util import generate_secret, dotted_path_for from nmdc_runtime.api.db.mongo import get_mongo_db +from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object from nmdc_runtime.api.models.job import Job, JobOperationMetadata +from nmdc_runtime.api.models.metadata import ChangesheetIn from nmdc_runtime.api.models.site import SiteInDB, SiteClientInDB from nmdc_runtime.api.models.user import UserInDB, UserIn, User from nmdc_runtime.site.repository import run_config_frozen__normal_env from nmdc_runtime.site.resources import get_mongo, RuntimeApiSiteClient +from nmdc_runtime.util import REPO_ROOT_DIR def ensure_test_resources(mdb): @@ -26,7 +32,7 @@ def ensure_test_resources(mdb): username=username, hashed_password=get_password_hash(password), site_admin=[site_id], - ).dict(exclude_unset=True), + ).model_dump(mode="json", exclude_unset=True), upsert=True, ) @@ -42,7 +48,9 @@ def ensure_test_resources(mdb): hashed_secret=get_password_hash(client_secret), ) ], - ).dict(), + ).model_dump( + mode="json", + ), upsert=True, ) wf_id = "test" @@ -50,7 +58,9 @@ def ensure_test_resources(mdb): prev_ops = {"metadata.job.id": job_id, "metadata.site_id": site_id} mdb.operations.delete_many(prev_ops) job = Job(**{"id": job_id, "workflow": {"id": wf_id}, "config": {}, "claims": []}) - mdb.jobs.replace_one({"id": job_id}, job.dict(exclude_unset=True), upsert=True) + mdb.jobs.replace_one( + {"id": job_id}, job.model_dump(mode="json", exclude_unset=True), upsert=True + ) return { "site_client": { "site_id": site_id, @@ -58,7 +68,7 @@ def ensure_test_resources(mdb): "client_secret": client_secret, }, "user": {"username": username, "password": password}, - "job": job.dict(exclude_unset=True), + "job": job.model_dump(mode="json", exclude_unset=True), } @@ -114,7 +124,7 @@ def get_token(): "POST", url=(base_url + "/users"), headers=headers, - json=user_in.dict(exclude_unset=True), + json=user_in.model_dump(mode="json", exclude_unset=True), ) try: @@ -181,3 +191,41 @@ def test_metadata_validate_json_with_unknown_collection(api_site_client): {"studi_set": []}, ) assert rv.json()["result"] == "errors" + + +def test_submit_changesheet(): + sheet_in = ChangesheetIn( + name="sheet", + content_type="text/tab-separated-values", + text="id\taction\tattribute\tvalue\nnmdc:bsm-12-7mysck21\tupdate\tpart_of\tnmdc:sty-11-pzmd0x14\n", + ) + mdb = get_mongo_db() + rs = ensure_test_resources(mdb) + if not mdb.biosample_set.find_one({"id": "nmdc:bsm-12-7mysck21"}): + mdb.biosample_set.insert_one( + json.loads( + ( + REPO_ROOT_DIR / "tests" / "files" / "nmdc_bsm-12-7mysck21.json" + ).read_text() + ) + ) + if not mdb.study_set.find_one({"id": "nmdc:sty-11-pzmd0x14"}): + mdb.study_set.insert_one( + json.loads( + ( + REPO_ROOT_DIR / "tests" / "files" / "nmdc_sty-11-pzmd0x14.json" + ).read_text() + ) + ) + df_change = df_from_sheet_in(sheet_in, mdb) + _ = _validate_changesheet(df_change, mdb) + drs_obj_doc = persist_content_and_get_drs_object( + content=sheet_in.text, + username=rs["user"]["username"], + filename=re.sub(r"[^A-Za-z0-9._\-]", "_", sheet_in.name), + content_type=sheet_in.content_type, + description="changesheet", + id_ns="changesheets", + 
) + mdb.objects.delete_one({"id": drs_obj_doc["id"]}) + assert True diff --git a/tests/test_api/test_metadata.py b/tests/test_api/test_metadata.py index 9fc63254..82b6e70f 100644 --- a/tests/test_api/test_metadata.py +++ b/tests/test_api/test_metadata.py @@ -7,6 +7,8 @@ import pytest from nmdc_runtime.api.db.mongo import get_mongo_db +from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object +from nmdc_runtime.api.models.metadata import ChangesheetIn from nmdc_runtime.util import get_nmdc_jsonschema_dict from toolz import dissoc @@ -15,6 +17,8 @@ update_mongo_db, mongo_update_command_for, copy_docs_in_update_cmd, + df_from_sheet_in, + _validate_changesheet, ) from nmdc_runtime.site.ops import ensure_data_object_type from nmdc_runtime.site.repository import run_config_frozen__normal_env diff --git a/util/mongorestore-nmdc.sh b/util/mongorestore-nmdc.sh index e0f5f253..aa23ccfb 100755 --- a/util/mongorestore-nmdc.sh +++ b/util/mongorestore-nmdc.sh @@ -4,4 +4,4 @@ # $ ./util/mongorestore-nmdc.sh mongorestore -h $MONGO_HOST -u $MONGO_USERNAME -p $MONGO_PASSWORD --authenticationDatabase=admin \ --gzip --drop \ - $HOME/nmdcdb-mongodump/nmdcdb/2023-05-24T11/ \ No newline at end of file + $HOME/nmdcdb-mongodump/nmdcdb/2023-11-02T11/ \ No newline at end of file From cbd58f3690bff76e4306ae9bbb2c1caec8d749c2 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 3 Nov 2023 13:29:29 -0400 Subject: [PATCH 03/28] docs: add release-to-pypi info --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 103df781..675a4272 100644 --- a/README.md +++ b/README.md @@ -144,3 +144,11 @@ desired and does not break over time. [For hints on how to write tests for solids and pipelines in Dagster, see their documentation tutorial on Testing](https://docs.dagster.io/tutorial/testable). + + +## Release to PyPI + +``` +python -m build +twine upload dist/* +``` \ No newline at end of file From b033bfbd988bb764269f4a2a8ff55c3de2e43c15 Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Fri, 3 Nov 2023 10:50:32 -0700 Subject: [PATCH 04/28] Add release workflow (#351) --- .github/workflows/release.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..c388f97b --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,29 @@ +name: Publish Python Package + +on: + release: + types: [created] + +jobs: + build-n-publish: + name: Build and publish Python 🐍 distributions 📦 to PyPI + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.10 + + - name: Build source and wheel archives + run: | + python -m build + + - name: Publish distribution 📦 to PyPI + if: github.repository == 'microbiomedata/nmdc-runtime' + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.PYPI_PASSWORD }} From 4d484915d2d9c18bbf07b62cd4a729f3d9c83c3a Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 3 Nov 2023 13:51:47 -0400 Subject: [PATCH 05/28] docs: don't upload >1 release --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 675a4272..6fc163df 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,7 @@ tutorial on Testing](https://docs.dagster.io/tutorial/testable). 
## Release to PyPI ``` +rm -rf dist python -m build twine upload dist/* ``` \ No newline at end of file From f551841ff231dc1a3609937d374f7aa8bd04c979 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 3 Nov 2023 13:56:22 -0400 Subject: [PATCH 06/28] Update release.yml (#352) I still hate YAML --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c388f97b..ae12349f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,7 +15,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: '3.10' - name: Build source and wheel archives run: | From 1cbfcf89d13ec2acc808abf1ff7935cee159a683 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 3 Nov 2023 13:59:29 -0400 Subject: [PATCH 07/28] fix: ensure deps for GH action --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ae12349f..3c8fec18 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,6 +19,7 @@ jobs: - name: Build source and wheel archives run: | + make init python -m build - name: Publish distribution 📦 to PyPI From 0c69b60a6bb6cae46ea976f2d74a64893117273d Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 3 Nov 2023 14:18:06 -0400 Subject: [PATCH 08/28] feat: version info from runtime (#353) --- nmdc_runtime/api/main.py | 13 ++++++++++++- requirements/dev.in | 1 - requirements/dev.txt | 10 ++-------- requirements/main.in | 1 + requirements/main.txt | 38 ++++++++++++++++++++++---------------- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index ca10adcc..2a348aa1 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -3,9 +3,11 @@ from importlib import import_module from importlib.metadata import version +import fastapi import uvicorn from fastapi import APIRouter, FastAPI from fastapi.middleware.cors import CORSMiddleware +from setuptools_scm import get_version from starlette import status from starlette.responses import RedirectResponse @@ -342,9 +344,18 @@ async def root(): ) +@api_router.get("/version") +async def get_versions(): + return { + "nmdc-runtime": get_version(), + "fastapi": fastapi.__version__, + "nmdc-schema": version("nmdc_schema"), + } + + app = FastAPI( title="NMDC Runtime API", - version="0.2.0", + version=get_version(), description=( "The NMDC Runtime API, via on-demand functions " "and via schedule-based and sensor-based automation, " diff --git a/requirements/dev.in b/requirements/dev.in index 1b7450f2..dbe7b8e9 100644 --- a/requirements/dev.in +++ b/requirements/dev.in @@ -10,6 +10,5 @@ pytest-asyncio pytest-cov requests-mock setuptools -setuptools-scm twine requests-cache \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt index 029c5a12..4120028f 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -23,7 +23,7 @@ certifi==2023.7.22 # via # -c requirements/main.txt # requests -charset-normalizer==3.3.1 +charset-normalizer==3.3.2 # via # -c requirements/main.txt # requests @@ -90,7 +90,6 @@ packaging==23.2 # black # build # pytest - # setuptools-scm pathspec==0.11.2 # via # -c requirements/main.txt @@ -127,7 +126,7 @@ pytest==7.4.3 # -r requirements/dev.in # pytest-asyncio # pytest-cov -pytest-asyncio==0.22.0 +pytest-asyncio==0.21.1 # via -r 
requirements/dev.in pytest-cov==4.1.0 # via -r requirements/dev.in @@ -154,8 +153,6 @@ rfc3986==2.0.0 # via twine rich==13.6.0 # via twine -setuptools-scm==8.0.4 - # via -r requirements/dev.in six==1.16.0 # via # -c requirements/main.txt @@ -170,7 +167,6 @@ tomli==2.0.1 # pip-tools # pyproject-hooks # pytest - # setuptools-scm twine==4.0.2 # via -r requirements/dev.in typing-extensions==4.8.0 @@ -178,7 +174,6 @@ typing-extensions==4.8.0 # -c requirements/main.txt # black # cattrs - # setuptools-scm url-normalize==1.4.3 # via # -c requirements/main.txt @@ -206,4 +201,3 @@ setuptools==68.2.2 # -c requirements/main.txt # -r requirements/dev.in # pip-tools - # setuptools-scm diff --git a/requirements/main.in b/requirements/main.in index 487c608e..46551ccd 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -35,6 +35,7 @@ python-multipart pyyaml requests semver +setuptools-scm tenacity terminusdb-client toolz diff --git a/requirements/main.txt b/requirements/main.txt index b2ae0d97..1ee0dc8b 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -67,9 +67,9 @@ black==23.10.1 # via shed bleach==6.1.0 # via nbconvert -boto3==1.28.74 +boto3==1.28.77 # via -r requirements/main.in -botocore==1.31.74 +botocore==1.31.77 # via # boto3 # s3transfer @@ -87,7 +87,7 @@ chardet==5.2.0 # via # pyshex # pyshexc -charset-normalizer==3.3.1 +charset-normalizer==3.3.2 # via requests click==8.1.7 # via @@ -121,25 +121,25 @@ croniter==2.0.1 # via dagster cryptography==41.0.5 # via python-jose -curies==0.6.7 +curies==0.7.0 # via linkml-runtime -dagit==1.5.5 +dagit==1.5.6 # via -r requirements/main.in -dagster==1.5.5 +dagster==1.5.6 # via # -r requirements/main.in # dagster-graphql # dagster-postgres # dagster-webserver -dagster-graphql==1.5.5 +dagster-graphql==1.5.6 # via # -r requirements/main.in # dagster-webserver -dagster-pipes==1.5.5 +dagster-pipes==1.5.6 # via dagster -dagster-postgres==0.21.5 +dagster-postgres==0.21.6 # via -r requirements/main.in -dagster-webserver==1.5.5 +dagster-webserver==1.5.6 # via dagit debugpy==1.8.0 # via ipykernel @@ -355,7 +355,7 @@ jupyter-server==2.9.1 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.7 +jupyterlab==4.0.8 # via # -r requirements/main.in # notebook @@ -387,7 +387,7 @@ linkml-runtime==1.6.0 # nmdc-schema mako==1.2.4 # via alembic -markdown==3.5 +markdown==3.5.1 # via # mkdocs # mkdocs-material @@ -485,6 +485,7 @@ packaging==23.2 # pytest # qtconsole # qtpy + # setuptools-scm # sphinx paginate==0.5.6 # via mkdocs-material @@ -530,7 +531,7 @@ prompt-toolkit==3.0.39 # via # ipython # jupyter-console -protobuf==4.24.4 +protobuf==4.25.0 # via # dagster # grpcio-health-checking @@ -583,7 +584,7 @@ pymdown-extensions==10.3.1 # via # mkdocs-material # mkdocs-mermaid2-plugin -pymongo==4.5.0 +pymongo==4.6.0 # via # -r requirements/main.in # motor @@ -721,7 +722,7 @@ rpds-py==0.10.6 # referencing rsa==4.9 # via python-jose -ruamel-yaml==0.18.3 +ruamel-yaml==0.18.5 # via linkml-dataops ruamel-yaml-clib==0.2.8 # via ruamel-yaml @@ -731,6 +732,8 @@ semver==3.0.2 # via -r requirements/main.in send2trash==1.8.2 # via jupyter-server +setuptools-scm==8.0.4 + # via -r requirements/main.in shed==2023.6.1 # via terminusdb-client shexjsg==0.8.2 @@ -783,7 +786,7 @@ sphinxcontrib-qthelp==1.0.6 # via sphinx sphinxcontrib-serializinghtml==1.1.9 # via sphinx -sqlalchemy==2.0.22 +sqlalchemy==2.0.23 # via # alembic # dagster @@ -823,6 +826,7 @@ tomli==2.0.1 # jupyterlab # numpydoc # pytest + # setuptools-scm toolz==0.12.0 # via -r 
requirements/main.in toposort==1.10 @@ -874,6 +878,7 @@ typing-extensions==4.8.0 # prefixmaps # pydantic # pydantic-core + # setuptools-scm # sqlalchemy # typing-inspect # uvicorn @@ -937,3 +942,4 @@ setuptools==68.2.2 # via # dagster # mkdocs-mermaid2-plugin + # setuptools-scm From dc668eaa9efb210336947673a9df379ccaf2b9f1 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 3 Nov 2023 14:25:23 -0400 Subject: [PATCH 09/28] allow manual trigger --- .github/workflows/build-and-push-docker-images.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-and-push-docker-images.yml b/.github/workflows/build-and-push-docker-images.yml index 32be4213..3e6f7c15 100644 --- a/.github/workflows/build-and-push-docker-images.yml +++ b/.github/workflows/build-and-push-docker-images.yml @@ -1,5 +1,6 @@ name: build-and-push-docker-images on: + workflow_dispatch: push: branches: - main From 6ccc21470e94433f8c29fdd60b094871e78edac8 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 3 Nov 2023 18:39:30 -0700 Subject: [PATCH 10/28] Implement preliminary migration notebook for `8.0.0` to `8.1.2` --- .../notebooks/.mongo.yaml.example | 8 + .../notebooks/migrate_8_0_0_to_8_1_2.ipynb | 490 ++++++++++++++++++ 2 files changed, 498 insertions(+) create mode 100644 demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb diff --git a/demo/metadata_migration/notebooks/.mongo.yaml.example b/demo/metadata_migration/notebooks/.mongo.yaml.example index a596e1f1..a1df2e02 100644 --- a/demo/metadata_migration/notebooks/.mongo.yaml.example +++ b/demo/metadata_migration/notebooks/.mongo.yaml.example @@ -15,6 +15,14 @@ # mongodb://root:pass@localhost:27017/?authSource=admin # ``` # +# Example: +# Assuming the same scenario as in the previous example, but without +# access control enabled (i.e. no username/password), +# the value of `uri` would be: +# ``` +# mongodb://localhost:27017/ +# ``` +# # Reference: # https://www.mongodb.com/docs/database-tools/mongodump/#std-option-mongodump.--uri # diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb new file mode 100644 index 00000000..83553b43 --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -0,0 +1,490 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Migrate Mongo data from `nmdc-schema` [`v8.0.0`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.0.0) to [`v8.1.2`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.1.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### 1. Determine Mongo collections that will be transformed\n", + "\n", + "In this step, you will determine which Mongo collections will be transformed during this migration.\n", + "\n", + "1. In [`nmdc_schema/migration_recursion.py`](https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py), locate the Python class whose name reflects the initial and final version numbers of this migration.\n", + "2. In that Python class, locate the `self.agenda` dictionary.\n", + "3. In that dictionary, make a list of the keys—these are the names of the Mongo collections that will be transformed during this migration. 
For example:\n", + " ```py\n", + " self.agenda = dict(\n", + " collection_name_1=[self.some_function],\n", + " collection_name_2=[self.some_function],\n", + " )\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Coordinate with teammates that read/write to those collections\n", + "\n", + "In this step, you'll identify and reach out to the people that read/write to those collections; to agree on a migration schedule that works for you and them.\n", + "\n", + "Here's a table of Mongo collections and the components of the NMDC system that write to them (according to [a conversation that occurred on September 11, 2023](https://nmdc-group.slack.com/archives/C01SVTKM8GK/p1694465755802979?thread_ts=1694216327.234519&cid=C01SVTKM8GK)).\n", + "\n", + "| Mongo collection | NMDC system components that write to it |\n", + "|---------------------------------------------|----------------------------------------------------------|\n", + "| `biosample_set` | Workflows (via manual entry via `nmdc-runtime` HTTP API) |\n", + "| `data_object_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `mags_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_annotation_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_assembly_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_based_taxonomy_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_qc_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `jobs` | Scheduler (via Mongo directly) |\n", + "| `*` | `nmdc-runtime` (via Mongo directly) |\n", + "\n", + "You can use that table to help determine which people read/write to those collections. You can then coordinate a migration time slot with them via Slack, email, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Setup a migration environment\n", + "\n", + "In this step, you'll set up an environment in which you can run this notebook.\n", + "\n", + "1. Start a **Mongo server** on your local machine (and ensure it does **not** contain a database named `nmdc`).\n", + " 1. You can start a temporary, [Docker](https://hub.docker.com/_/mongo)-based Mongo server at `localhost:27055` by running this command:\n", + " ```shell\n", + " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo\n", + " ```\n", + " > Note: A Mongo server started via that command will have no access control (i.e. you will be able to access it without a username or password).\n", + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. You can use the `.notebook.env.example` file as a template:\n", + " ```shell\n", + " $ cp .notebook.env.example .notebook.env\n", + " ```\n", + "3. Create and populate **Mongo configuration files** for connecting to the origin and transformer Mongo servers.\n", + " 1. You can use the `.mongo.yaml.example` file as a template:\n", + " ```shell\n", + " $ cp .mongo.yaml.example .mongo.origin.yaml\n", + " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", + " ```\n", + " > When populating the file for the origin Mongo server, use credentials that have write access to the `nmdc` database." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Python dependencies\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends. You can do that by running this cell.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook) now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==8.1.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "> Note: One of the Python objects is a Python class that is specific to this migration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library packages:\n", + "from pathlib import Path\n", + "from pprint import pformat\n", + "from shutil import rmtree\n", + "from tempfile import NamedTemporaryFile\n", + "\n", + "# Third-party packages:\n", + "import pymongo\n", + "from nmdc_schema.migration_recursion import Migrator_from_8_0_0_to_8_1_0 as Migrator\n", + "\n", + "# First-party packages:\n", + "from helpers import Config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Programmatically determine which collections will be transformed\n", + "\n", + "Here are the names of the collections this migration will transform.\n", + "\n", + "> Ensure you have coordinated with the people that read/write to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agenda_collection_names = Migrator().agenda.keys()\n", + "\n", + "print(\"\\n\".join(agenda_collection_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create MongoDB clients\n", + "\n", + "Create MongoDB clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations).\n", + "\n", + "> Note: This cell includes a query (which retrieves version information) for each server, so we find out immediately if the server is accessible to the notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# MongoDB client for origin MongoDB server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + "# MongoDB client for transformer MongoDB server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", + "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump collections from the origin Mongo server\n", + "\n", + "In this step, you'll use `mongodump` to dump the collections that will be transformed during this migration; from the \"origin\" Mongo server.\n", + "\n", + "Since `mongodump` doesn't provide a CLI option that you can use to specify the collections you _want_ it to dump (unless that is only one collection), you can use a different CLI option to tell it all the collection you do _not_ want it to dump. The end result will be the same—there's just an extra step involved." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That extra step is to generate an `--excludeCollection=\"{name}\"` CLI option for each collection that is not on the agenda, which you'll do now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", + "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", + "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n", + "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", + "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "\n", + "print(exclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongodump` command containing all those `--excludeCollection=\"{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the not-excluded collections from the origin database.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.origin_dump_folder_path}\" \\\n", + " {mongodump_exclude_collection_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Restore the dump into the transformer MongoDB server\n", + "\n", + "In this step, you'll load the collections dumped from the \"origin\" Mongo server, into the \"transformer\" MongoDB server.\n", + "\n", + "Since it's possible that the dump includes more collections than are on the agenda (due to someone creating a collection between the time you generated the exclusion list and the time you ran `mongodump`), you will use one or more of `mongorestore`'s `--nsInclude` CLI options to indicate which collections you want to load.\n", + "\n", + "Here's where you will generate the `--nsInclude=\"nmdc.{name}\"` CLI options." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n", + "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "\n", + "print(inclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongorestore` command containing all those `--nsInclude=\"nmdc.{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Restore the dumped collections to the transformer MongoDB server.\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform the database\n", + "\n", + "Now that the transformer database contains a copy of each collection on the agenda, you can transform those copies.\n", + "\n", + "The transformation functions are provided by the `nmdc-schema` Python package.\n", + "> You can examine the transformation functions at: https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py\n", + "\n", + "In this step, you will retrieve each documents from each collection on the agenda, pass it to the associated transformation function(s) on the agenda, then store the transformed document in place of the original one—all within the \"transformation\" database only. **The \"origin\" database is not involved with this step.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "migrator = Migrator()\n", + "\n", + "# Apply the transformations.\n", + "for collection_name, transformation_pipeline in migrator.agenda.items():\n", + " print(f\"Transforming documents in collection: {collection_name}\")\n", + " transformed_documents = []\n", + "\n", + " # Get each document from this collection.\n", + " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " for original_document in collection.find():\n", + " \n", + " # Put the document through the transformation pipeline associated with this collection.\n", + " print(original_document)\n", + " transformed_document = original_document # initializes the variable\n", + " for transformation_function in transformation_pipeline:\n", + " transformed_document = transformation_function(transformed_document)\n", + "\n", + " # Store the transformed document.\n", + " print(transformed_document)\n", + " print(\"\")\n", + " transformed_documents.append(transformed_document)\n", + "\n", + " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", + " for transformed_document in transformed_documents:\n", + " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Validate the transformed database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump the transformed database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [], + "source": [ + "# Dump the database from the transformer MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {mongodump_exclude_collection_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Put the transformed data into the origin MongoDB server\n", + "\n", + "In this step, you'll put the transformed collection(s) into the origin MongoDB server, replacing the original collection(s) that has/have the same name(s)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", + " --drop \\\n", + " --preserveUUID \\ \n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Clean up\n", + "\n", + "Delete the temporary files and MongoDB dumps created by this notebook.\n", + "\n", + "> Note: You can skip this step, in case you want to delete them manually later (e.g. to examine them before deleting them)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "paths_to_files_to_delete = []\n", + "\n", + "paths_to_folders_to_delete = [\n", + " cfg.origin_dump_folder_path,\n", + " cfg.transformer_dump_folder_path,\n", + "]\n", + "\n", + "# Delete files.\n", + "for path in [Path(string) for string in paths_to_files_to_delete]:\n", + " try:\n", + " path.unlink()\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")\n", + "\n", + "# Delete folders.\n", + "for path in [Path(string) for string in paths_to_folders_to_delete]:\n", + " try:\n", + " rmtree(path)\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 5035c37204ab45334ee424c8bf092ff1ad8eb02c Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 3 Nov 2023 18:55:12 -0700 Subject: [PATCH 11/28] Remove unused imports and refine headings --- .../notebooks/migrate_8_0_0_to_8_1_2.ipynb | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb index 83553b43..a4d8a62e 100644 --- a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -128,9 +128,7 @@ "source": [ "# Standard library packages:\n", "from pathlib import Path\n", - "from pprint import pformat\n", "from shutil import rmtree\n", - "from tempfile import NamedTemporaryFile\n", "\n", "# Third-party packages:\n", "import pymongo\n", @@ -188,9 +186,9 @@ "cell_type": 
"markdown", "metadata": {}, "source": [ - "### Create MongoDB clients\n", + "### Create Mongo clients\n", "\n", - "Create MongoDB clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations).\n", + "Create Mongo clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations).\n", "\n", "> Note: This cell includes a query (which retrieves version information) for each server, so we find out immediately if the server is accessible to the notebook." ] @@ -214,7 +212,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Dump collections from the origin Mongo server\n", + "### Dump collections from the \"origin\" Mongo server\n", "\n", "In this step, you'll use `mongodump` to dump the collections that will be transformed during this migration; from the \"origin\" Mongo server.\n", "\n", @@ -269,7 +267,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Restore the dump into the transformer MongoDB server\n", + "### Load the collections into the \"transformer\" Mongo server\n", "\n", "In this step, you'll load the collections dumped from the \"origin\" Mongo server, into the \"transformer\" MongoDB server.\n", "\n", @@ -317,7 +315,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Transform the database\n", + "### Transform the collections within the \"transformer\" Mongo server\n", "\n", "Now that the transformer database contains a copy of each collection on the agenda, you can transform those copies.\n", "\n", @@ -364,7 +362,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Validate the transformed database" + "### Validate the transformed collections" ] }, { @@ -380,7 +378,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Dump the transformed database" + "### Dump the transformed collections" ] }, { @@ -402,9 +400,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Put the transformed data into the origin MongoDB server\n", + "### Load the transformed data into the \"origin\" Mongo server\n", "\n", - "In this step, you'll put the transformed collection(s) into the origin MongoDB server, replacing the original collection(s) that has/have the same name(s)." + "In this step, you'll put the transformed collection(s) into the origin MongoDB server, replacing the original collection(s) that have the same name(s)." 
] }, { From 775187f0f17daf66c1ef3e6547eac7a66e7efd7e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 3 Nov 2023 20:20:42 -0700 Subject: [PATCH 12/28] Perform JSON Schema-based validation on transformed document --- .../notebooks/migrate_8_0_0_to_8_1_2.ipynb | 92 ++++++++++++++----- .../notebooks/requirements.txt | 1 + 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb index a4d8a62e..d3017f78 100644 --- a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -129,9 +129,11 @@ "# Standard library packages:\n", "from pathlib import Path\n", "from shutil import rmtree\n", + "from jsonschema import Draft7Validator\n", "\n", "# Third-party packages:\n", "import pymongo\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", "from nmdc_schema.migration_recursion import Migrator_from_8_0_0_to_8_1_0 as Migrator\n", "\n", "# First-party packages:\n", @@ -179,7 +181,27 @@ "\n", "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", "mongodump = cfg.mongodump_path\n", - "mongorestore = cfg.mongorestore_path" + "mongorestore = cfg.mongorestore_path\n", + "\n", + "print(mongodump)\n", + "print(mongorestore)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the application paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!{mongodump} --version\n", + "!{mongorestore} --version" ] }, { @@ -199,13 +221,38 @@ "metadata": {}, "outputs": [], "source": [ - "# MongoDB client for origin MongoDB server.\n", + "# Mongo client for origin Mongo server.\n", "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", "\n", - "# MongoDB client for transformer MongoDB server.\n", + "# Mongo client for transformer Mongo server.\n", "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", - "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])" + "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Confirm the transformer Mongo server does not contain an \"nmdc\" database.\n", + "# Note: Raises an `AssertionError` if the asserted expression isn't True.\n", + "assert \"nmdc\" not in transformer_mongo_client.list_database_names()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create JSON Schema validator\n", + "\n", + "In this step, you'll create a JSON Schema validator for the NMDC Schema." 
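+    "\n",
+    "> Note: `Draft7Validator.is_valid(instance)` returns a boolean, whereas `Draft7Validator.validate(instance)` returns `None` on success and raises a `ValidationError` otherwise. Here is a minimal, self-contained sketch of the difference (it uses a toy schema, not the NMDC Schema):\n",
+    "\n",
+    "```python\n",
+    "from jsonschema import Draft7Validator\n",
+    "\n",
+    "toy_validator = Draft7Validator({\"type\": \"object\", \"required\": [\"id\"]})\n",
+    "print(toy_validator.is_valid({}))          # False (missing \"id\"); does not raise\n",
+    "toy_validator.validate({\"id\": \"nmdc:x\"})   # returns None; would raise ValidationError if invalid\n",
+    "```\n",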
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)\n", + "\n", + "print(nmdc_jsonschema_validator.validate({}) is None) # sanity test: no keys -> no errors" ] }, { @@ -260,7 +307,7 @@ " --db=\"nmdc\" \\\n", " --gzip \\\n", " --out=\"{cfg.origin_dump_folder_path}\" \\\n", - " {mongodump_exclude_collection_options_str}" + " {exclusion_options_str}" ] }, { @@ -322,7 +369,9 @@ "The transformation functions are provided by the `nmdc-schema` Python package.\n", "> You can examine the transformation functions at: https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py\n", "\n", - "In this step, you will retrieve each documents from each collection on the agenda, pass it to the associated transformation function(s) on the agenda, then store the transformed document in place of the original one—all within the \"transformation\" database only. **The \"origin\" database is not involved with this step.**" + "In this step, you will retrieve each documents from each collection on the agenda, pass it to the associated transformation function(s) on the agenda, then store the transformed document in place of the original one—all within the \"transformation\" database only. **The \"origin\" database is not involved with this step.**\n", + "\n", + "> Note: This step also includes validation. Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py" ] }, { @@ -333,6 +382,8 @@ "source": [ "migrator = Migrator()\n", "\n", + "ids_of_invalid_transformed_documents = dict() # key is collection name, value is a list of IDs of invalid transformed documents\n", + "\n", "# Apply the transformations.\n", "for collection_name, transformation_pipeline in migrator.agenda.items():\n", " print(f\"Transforming documents in collection: {collection_name}\")\n", @@ -353,25 +404,18 @@ " print(\"\")\n", " transformed_documents.append(transformed_document)\n", "\n", + " # Validate the transformed document.\n", + " # See: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md#schema-validation\n", + " if not nmdc_jsonschema_validator.is_valid(transformed_document):\n", + " if collection_name not in ids_of_invalid_transformed_documents:\n", + " ids_of_invalid_transformed_documents[collection_name] = []\n", + " ids_of_invalid_transformed_documents[collection_name].append(transformed_document[\"id\"])\n", + "\n", " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", " for transformed_document in transformed_documents:\n", - " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Validate the transformed collections" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO" + " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n", + "\n", + "print(f\"{ids_of_invalid_transformed_documents=}\")\n" ] }, { @@ -393,7 +437,7 @@ " --db=\"nmdc\" \\\n", " --gzip \\\n", " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", - " {mongodump_exclude_collection_options_str}" + " {exclusion_options_str}" ] }, { diff --git 
a/demo/metadata_migration/notebooks/requirements.txt b/demo/metadata_migration/notebooks/requirements.txt index 4944b3bd..516a0401 100644 --- a/demo/metadata_migration/notebooks/requirements.txt +++ b/demo/metadata_migration/notebooks/requirements.txt @@ -1,3 +1,4 @@ +jsonschema==4.19.2 pymongo==4.5.0 python-dotenv==1.0.0 PyYAML==6.0.1 \ No newline at end of file From f61fad48e91bb942f36c22a41d28492a8b492c86 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sat, 4 Nov 2023 11:56:38 -0700 Subject: [PATCH 13/28] Fix validation code so document is validated against correct part of schema --- .../notebooks/migrate_8_0_0_to_8_1_2.ipynb | 88 ++++++++++++++----- 1 file changed, 64 insertions(+), 24 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb index d3017f78..29d63658 100644 --- a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -159,6 +159,7 @@ "source": [ "agenda_collection_names = Migrator().agenda.keys()\n", "\n", + "print(\"The following collections will be transformed:\")\n", "print(\"\\n\".join(agenda_collection_names))" ] }, @@ -181,10 +182,7 @@ "\n", "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", "mongodump = cfg.mongodump_path\n", - "mongorestore = cfg.mongorestore_path\n", - "\n", - "print(mongodump)\n", - "print(mongorestore)" + "mongorestore = cfg.mongorestore_path" ] }, { @@ -210,9 +208,7 @@ "source": [ "### Create Mongo clients\n", "\n", - "Create Mongo clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations).\n", - "\n", - "> Note: This cell includes a query (which retrieves version information) for each server, so we find out immediately if the server is accessible to the notebook." + "Create Mongo clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations)." ] }, { @@ -223,15 +219,35 @@ "source": [ "# Mongo client for origin Mongo server.\n", "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", - "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", "\n", "# Mongo client for transformer Mongo server.\n", - "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the Mongo clients' abilities to access their respective Mongo servers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the Mongo server version (running on the \"origin\" Mongo server).\n", + "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Sanity test: Ensure the origin database exists.\n", + "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n", "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", "\n", - "# Confirm the transformer Mongo server does not contain an \"nmdc\" database.\n", - "# Note: Raises an `AssertionError` if the asserted expression isn't True.\n", - "assert \"nmdc\" not in transformer_mongo_client.list_database_names()" + "# Sanity test: Ensure the transformation database does not exist.\n", + "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" ] }, { @@ -250,9 +266,28 @@ "outputs": [], "source": [ "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", - "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", + "\n", + "> Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", + "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", - "print(nmdc_jsonschema_validator.validate({}) is None) # sanity test: no keys -> no errors" + "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" ] }, { @@ -382,8 +417,6 @@ "source": [ "migrator = Migrator()\n", "\n", - "ids_of_invalid_transformed_documents = dict() # key is collection name, value is a list of IDs of invalid transformed documents\n", - "\n", "# Apply the transformations.\n", "for collection_name, transformation_pipeline in migrator.agenda.items():\n", " print(f\"Transforming documents in collection: {collection_name}\")\n", @@ -405,17 +438,24 @@ " transformed_documents.append(transformed_document)\n", "\n", " # Validate the transformed document.\n", - " # See: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md#schema-validation\n", - " if not nmdc_jsonschema_validator.is_valid(transformed_document):\n", - " if collection_name not in ids_of_invalid_transformed_documents:\n", - " ids_of_invalid_transformed_documents[collection_name] = []\n", - " ids_of_invalid_transformed_documents[collection_name].append(transformed_document[\"id\"])\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. 
a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", + " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", + " #\n", + " transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n", + " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", + " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", "\n", " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", " for transformed_document in transformed_documents:\n", - " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n", - "\n", - "print(f\"{ids_of_invalid_transformed_documents=}\")\n" + " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" ] }, { From bcaad732a61a7aa06a086e08e10272f1191dafac Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sat, 4 Nov 2023 12:18:11 -0700 Subject: [PATCH 14/28] Validate transformed document before appending it to list --- .../notebooks/migrate_8_0_0_to_8_1_2.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb index 29d63658..d308fcc3 100644 --- a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -431,11 +431,7 @@ " transformed_document = original_document # initializes the variable\n", " for transformation_function in transformation_pipeline:\n", " transformed_document = transformation_function(transformed_document)\n", - "\n", - " # Store the transformed document.\n", " print(transformed_document)\n", - " print(\"\")\n", - " transformed_documents.append(transformed_document)\n", "\n", " # Validate the transformed document.\n", " #\n", @@ -453,6 +449,10 @@ " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", "\n", + " # Store the transformed document.\n", + " transformed_documents.append(transformed_document) \n", + " print(\"\") \n", + "\n", " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", " for transformed_document in transformed_documents:\n", " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" From ee3ea0e6d629a686dc041f545dccde6ef026ea7d Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 5 Nov 2023 14:12:29 -0800 Subject: [PATCH 15/28] Print a before-and-after diff of each modified document --- .../notebooks/migrate_8_0_0_to_8_1_2.ipynb | 19 ++++++++++++++++--- .../notebooks/requirements.txt | 1 + 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb index d308fcc3..67ee6038 100644 --- a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -129,12 +129,14 @@ "# Standard library packages:\n", "from pathlib import 
Path\n", "from shutil import rmtree\n", - "from jsonschema import Draft7Validator\n", + "from copy import deepcopy\n", "\n", "# Third-party packages:\n", "import pymongo\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", "from nmdc_schema.migration_recursion import Migrator_from_8_0_0_to_8_1_0 as Migrator\n", + "from jsonschema import Draft7Validator\n", + "from dictdiffer import diff\n", "\n", "# First-party packages:\n", "from helpers import Config" @@ -406,7 +408,9 @@ "\n", "In this step, you will retrieve each documents from each collection on the agenda, pass it to the associated transformation function(s) on the agenda, then store the transformed document in place of the original one—all within the \"transformation\" database only. **The \"origin\" database is not involved with this step.**\n", "\n", - "> Note: This step also includes validation. Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py" + "> Note: This step also includes validation. Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py\n", + "\n", + "> Note: This step also include a before-and-after comparison to facilitate manual spot checks. References: https://docs.python.org/3/library/copy.html#copy.deepcopy and https://dictdiffer.readthedocs.io/" ] }, { @@ -425,13 +429,22 @@ " # Get each document from this collection.\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", " for original_document in collection.find():\n", + " # Make a deep copy of the original document, to enable before-and-after comparison.\n", + " print(original_document)\n", + " copy_of_original_document = deepcopy(original_document)\n", " \n", " # Put the document through the transformation pipeline associated with this collection.\n", - " print(original_document)\n", " transformed_document = original_document # initializes the variable\n", " for transformation_function in transformation_pipeline:\n", " transformed_document = transformation_function(transformed_document)\n", " print(transformed_document)\n", + " \n", + " # Compare the transformed document with a copy of the original document;\n", + " # and, if there are any differences, print those differences.\n", + " difference = diff(copy_of_original_document, transformed_document)\n", + " differences = list(difference)\n", + " if len(differences) > 0:\n", + " print(f\"✏️ {differences}\")\n", "\n", " # Validate the transformed document.\n", " #\n", diff --git a/demo/metadata_migration/notebooks/requirements.txt b/demo/metadata_migration/notebooks/requirements.txt index 516a0401..4125ed9f 100644 --- a/demo/metadata_migration/notebooks/requirements.txt +++ b/demo/metadata_migration/notebooks/requirements.txt @@ -1,3 +1,4 @@ +dictdiffer==0.9.0 jsonschema==4.19.2 pymongo==4.5.0 python-dotenv==1.0.0 From ebe41c9874ccf0326b925462afd6f8f672b6671e Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Mon, 6 Nov 2023 15:15:31 -0500 Subject: [PATCH 16/28] fix: use @field_serializer for pydantic `Url`s (#363) * fix: use @field_serializer for pydantic `Url`s ensures `model_dump` preserves python `datetime`s, which are handled by pymongo. reverts mode='json' option to model_dump, to preserve datetimes. 
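For illustration, a minimal standalone sketch of the pattern (the model and
field names here are hypothetical, not taken from this diff):

    from datetime import datetime, timezone
    from pydantic import AnyUrl, BaseModel, field_serializer

    class ExampleDoc(BaseModel):
        url: AnyUrl
        created_at: datetime

        @field_serializer("url")
        def serialize_url(self, url: AnyUrl, _info):
            # Stringify only the Url field; other fields keep their Python types.
            return str(url)

    doc = ExampleDoc(
        url="https://example.com/x",
        created_at=datetime.now(timezone.utc),
    ).model_dump()
    # doc["url"] is a plain str, while doc["created_at"] is still a datetime,
    # so pymongo can store it as a native BSON date.
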
fixes #349 * fix: preserve exclude_unset * style: remove `print` for debugging * fix: remove stale comment --- .../workflow_execution_activity/core.py | 4 +- nmdc_runtime/api/endpoints/objects.py | 6 +-- nmdc_runtime/api/endpoints/operations.py | 4 +- nmdc_runtime/api/endpoints/queries.py | 10 ++--- nmdc_runtime/api/endpoints/runs.py | 6 +-- nmdc_runtime/api/endpoints/search.py | 4 +- nmdc_runtime/api/endpoints/sites.py | 12 +----- nmdc_runtime/api/endpoints/users.py | 22 +++-------- nmdc_runtime/api/endpoints/util.py | 18 +++------ nmdc_runtime/api/main.py | 10 ++--- nmdc_runtime/api/models/object.py | 23 +++++++++++ nmdc_runtime/api/models/operation.py | 6 ++- nmdc_runtime/api/models/run.py | 18 ++------- nmdc_runtime/minter/adapters/repository.py | 13 +------ .../minter/entrypoints/fastapi_app.py | 4 +- nmdc_runtime/site/drsobjects/ingest.py | 6 +-- nmdc_runtime/site/ops.py | 15 ++------ nmdc_runtime/site/repository.py | 4 +- nmdc_runtime/site/resources.py | 4 +- tests/integration/test_minter_repository.py | 38 +++---------------- tests/test_api/test_endpoints.py | 12 +++--- tests/test_graphs/ensure_jobs.py | 26 +++++++++++++ 22 files changed, 106 insertions(+), 159 deletions(-) create mode 100644 tests/test_graphs/ensure_jobs.py diff --git a/components/nmdc_runtime/workflow_execution_activity/core.py b/components/nmdc_runtime/workflow_execution_activity/core.py index ad7d020d..43236234 100644 --- a/components/nmdc_runtime/workflow_execution_activity/core.py +++ b/components/nmdc_runtime/workflow_execution_activity/core.py @@ -94,9 +94,7 @@ def insert_into_keys( workflow: Workflow, data_objects: list[DataObject] ) -> dict[str, Any]: """Insert data object url into correct workflow input field.""" - workflow_dict = workflow.model_dump( - mode="json", - ) + workflow_dict = workflow.model_dump() for key in workflow_dict["inputs"]: for do in data_objects: if workflow_dict["inputs"][key] == str(do.data_object_type): diff --git a/nmdc_runtime/api/endpoints/objects.py b/nmdc_runtime/api/endpoints/objects.py index 0dd3b443..706f9049 100644 --- a/nmdc_runtime/api/endpoints/objects.py +++ b/nmdc_runtime/api/endpoints/objects.py @@ -78,7 +78,7 @@ def create_object( """ id_supplied = supplied_object_id( - mdb, client_site, object_in.model_dump(mode="json", exclude_unset=True) + mdb, client_site, object_in.model_dump(exclude_unset=True) ) drs_id = local_part( id_supplied if id_supplied is not None else generate_one_id(mdb, S3_ID_NS) @@ -255,9 +255,7 @@ def update_object( status_code=status.HTTP_403_FORBIDDEN, detail=f"client authorized for different site_id than {object_mgr_site}", ) - doc_object_patched = merge( - doc, object_patch.model_dump(mode="json", exclude_unset=True) - ) + doc_object_patched = merge(doc, object_patch.model_dump(exclude_unset=True)) mdb.operations.replace_one({"id": object_id}, doc_object_patched) return doc_object_patched diff --git a/nmdc_runtime/api/endpoints/operations.py b/nmdc_runtime/api/endpoints/operations.py index ecb4d33e..c6bcccf2 100644 --- a/nmdc_runtime/api/endpoints/operations.py +++ b/nmdc_runtime/api/endpoints/operations.py @@ -61,13 +61,13 @@ def update_operation( detail=f"client authorized for different site_id than {site_id_op}", ) op_patch_metadata = merge( - op_patch.model_dump(mode="json", exclude_unset=True).get("metadata", {}), + op_patch.model_dump(exclude_unset=True).get("metadata", {}), pick(["site_id", "job", "model"], doc_op.get("metadata", {})), ) doc_op_patched = merge( doc_op, assoc( - op_patch.model_dump(mode="json", exclude_unset=True), + 
op_patch.model_dump(exclude_unset=True), "metadata", op_patch_metadata, ), diff --git a/nmdc_runtime/api/endpoints/queries.py b/nmdc_runtime/api/endpoints/queries.py index 11417698..3d57166a 100644 --- a/nmdc_runtime/api/endpoints/queries.py +++ b/nmdc_runtime/api/endpoints/queries.py @@ -75,9 +75,9 @@ def run_query( id=qid, saved_at=saved_at, ) - mdb.queries.insert_one(query.model_dump(mode="json", exclude_unset=True)) + mdb.queries.insert_one(query.model_dump(exclude_unset=True)) cmd_response = _run_query(query, mdb) - return unmongo(cmd_response.model_dump(mode="json", exclude_unset=True)) + return unmongo(cmd_response.model_dump(exclude_unset=True)) @router.get("/queries/{query_id}", response_model=Query) @@ -107,7 +107,7 @@ def rerun_query( check_can_delete(user) cmd_response = _run_query(query, mdb) - return unmongo(cmd_response.model_dump(mode="json", exclude_unset=True)) + return unmongo(cmd_response.model_dump(exclude_unset=True)) def _run_query(query, mdb) -> CommandResponse: @@ -131,12 +131,12 @@ def _run_query(query, mdb) -> CommandResponse: detail="Failed to back up to-be-deleted documents. operation aborted.", ) - q_response = mdb.command(query.cmd.model_dump(mode="json", exclude_unset=True)) + q_response = mdb.command(query.cmd.model_dump(exclude_unset=True)) cmd_response: CommandResponse = command_response_for(q_type)(**q_response) query_run = ( QueryRun(qid=query.id, ran_at=ran_at, result=cmd_response) if cmd_response.ok else QueryRun(qid=query.id, ran_at=ran_at, error=cmd_response) ) - mdb.query_runs.insert_one(query_run.model_dump(mode="json", exclude_unset=True)) + mdb.query_runs.insert_one(query_run.model_dump(exclude_unset=True)) return cmd_response diff --git a/nmdc_runtime/api/endpoints/runs.py b/nmdc_runtime/api/endpoints/runs.py index 8bd9f22d..7c41ad84 100644 --- a/nmdc_runtime/api/endpoints/runs.py +++ b/nmdc_runtime/api/endpoints/runs.py @@ -94,9 +94,5 @@ def post_run_event( status_code=status.HTTP_400_BAD_REQUEST, detail=f"Supplied run_event.run.id does not match run_id given in request URL.", ) - mdb.run_events.insert_one( - run_event.model_dump( - mode="json", - ) - ) + mdb.run_events.insert_one(run_event.model_dump()) return _get_run_summary(run_event.run.id, mdb) diff --git a/nmdc_runtime/api/endpoints/search.py b/nmdc_runtime/api/endpoints/search.py index 4813f7d2..b48c411e 100644 --- a/nmdc_runtime/api/endpoints/search.py +++ b/nmdc_runtime/api/endpoints/search.py @@ -25,9 +25,7 @@ def data_objects( req: DataObjectListRequest = Depends(), mdb: MongoDatabase = Depends(get_mongo_db), ): - filter_ = list_request_filter_to_mongo_filter( - req.model_dump(mode="json", exclude_unset=True) - ) + filter_ = list_request_filter_to_mongo_filter(req.model_dump(exclude_unset=True)) max_page_size = filter_.pop("max_page_size", None) page_token = filter_.pop("page_token", None) req = ListRequest( diff --git a/nmdc_runtime/api/endpoints/sites.py b/nmdc_runtime/api/endpoints/sites.py index 76adfdc1..9e587d03 100644 --- a/nmdc_runtime/api/endpoints/sites.py +++ b/nmdc_runtime/api/endpoints/sites.py @@ -56,11 +56,7 @@ def create_site( status_code=status.HTTP_409_CONFLICT, detail=f"site with supplied id {site.id} already exists", ) - mdb.sites.insert_one( - site.model_dump( - mode="json", - ) - ) + mdb.sites.insert_one(site.model_dump()) refresh_minter_requesters_from_sites() rv = mdb.users.update_one( {"username": user.username}, @@ -169,11 +165,7 @@ def put_object_in_site( }, } ) - mdb.operations.insert_one( - op.model_dump( - mode="json", - ) - ) + 
mdb.operations.insert_one(op.model_dump()) return op diff --git a/nmdc_runtime/api/endpoints/users.py b/nmdc_runtime/api/endpoints/users.py index 587ad453..5799ca3c 100644 --- a/nmdc_runtime/api/endpoints/users.py +++ b/nmdc_runtime/api/endpoints/users.py @@ -35,11 +35,7 @@ async def login_for_access_token( detail="Incorrect username or password", headers={"WWW-Authenticate": "Bearer"}, ) - access_token_expires = timedelta( - **ACCESS_TOKEN_EXPIRES.model_dump( - mode="json", - ) - ) + access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.model_dump()) access_token = create_access_token( data={"sub": f"user:{user.username}"}, expires_delta=access_token_expires ) @@ -54,11 +50,7 @@ async def login_for_access_token( headers={"WWW-Authenticate": "Bearer"}, ) # TODO make below an absolute time - access_token_expires = timedelta( - **ACCESS_TOKEN_EXPIRES.model_dump( - mode="json", - ) - ) + access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.model_dump()) access_token = create_access_token( data={"sub": f"client:{form_data.client_id}"}, expires_delta=access_token_expires, @@ -66,9 +58,7 @@ async def login_for_access_token( return { "access_token": access_token, "token_type": "bearer", - "expires": ACCESS_TOKEN_EXPIRES.model_dump( - mode="json", - ), + "expires": ACCESS_TOKEN_EXPIRES.model_dump(), } @@ -94,10 +84,8 @@ def create_user( check_can_create_user(requester) mdb.users.insert_one( UserInDB( - **user_in.model_dump( - mode="json", - ), + **user_in.model_dump(), hashed_password=get_password_hash(user_in.password), - ).model_dump(mode="json", exclude_unset=True) + ).model_dump(exclude_unset=True) ) return mdb.users.find_one({"username": user_in.username}) diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index 9b228bda..fd473682 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -454,11 +454,11 @@ def _create_object( mdb: MongoDatabase, object_in: DrsObjectIn, mgr_site, drs_id, self_uri ): drs_obj = DrsObject( - **object_in.model_dump(exclude_unset=True, mode="json"), + **object_in.model_dump(exclude_unset=True), id=drs_id, self_uri=self_uri, ) - doc = drs_obj.model_dump(exclude_unset=True, mode="json") + doc = drs_obj.model_dump(exclude_unset=True) doc["_mgr_site"] = mgr_site # manager site try: mdb.objects.insert_one(doc) @@ -519,22 +519,16 @@ def _claim_job(job_id: str, mdb: MongoDatabase, site: Site): "workflow": job.workflow, "config": job.config, } - ).model_dump(mode="json", exclude_unset=True), + ).model_dump(exclude_unset=True), "site_id": site.id, "model": dotted_path_for(JobOperationMetadata), }, } ) - mdb.operations.insert_one( - op.model_dump( - mode="json", - ) - ) - mdb.jobs.replace_one( - {"id": job.id}, job.model_dump(mode="json", exclude_unset=True) - ) + mdb.operations.insert_one(op.model_dump()) + mdb.jobs.replace_one({"id": job.id}, job.model_dump(exclude_unset=True)) - return op.model_dump(mode="json", exclude_unset=True) + return op.model_dump(exclude_unset=True) @lru_cache diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 2a348aa1..71a6863e 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -234,9 +234,7 @@ def ensure_initial_resources_on_boot(): collection_boot = import_module(f"nmdc_runtime.api.boot.{collection_name}") for model in collection_boot.construct(): - doc = model.model_dump( - mode="json", - ) + doc = model.model_dump() mdb[collection_name].replace_one({"id": doc["id"]}, doc, upsert=True) username = os.getenv("API_ADMIN_USER") @@ 
-248,7 +246,7 @@ def ensure_initial_resources_on_boot(): username=username, hashed_password=get_password_hash(os.getenv("API_ADMIN_PASS")), site_admin=[os.getenv("API_SITE_ID")], - ).model_dump(mode="json", exclude_unset=True), + ).model_dump(exclude_unset=True), upsert=True, ) mdb.users.create_index("username") @@ -269,9 +267,7 @@ def ensure_initial_resources_on_boot(): ), ) ], - ).model_dump( - mode="json", - ), + ).model_dump(), upsert=True, ) diff --git a/nmdc_runtime/api/models/object.py b/nmdc_runtime/api/models/object.py index 26af100c..17df772c 100644 --- a/nmdc_runtime/api/models/object.py +++ b/nmdc_runtime/api/models/object.py @@ -12,6 +12,7 @@ BaseModel, AnyUrl, HttpUrl, + field_serializer, ) from typing_extensions import Annotated @@ -31,6 +32,10 @@ class AccessURL(BaseModel): headers: Optional[Dict[str, str]] = None url: AnyUrl + @field_serializer("url") + def serialize_url(self, url: AnyUrl, _info): + return str(url) + class AccessMethod(BaseModel): access_id: Optional[Annotated[str, StringConstraints(min_length=1)]] = None @@ -78,6 +83,12 @@ def no_contents_means_single_blob(cls, values): raise ValueError("no contents means no further nesting, so id required") return values + @field_serializer("drs_uri") + def serialize_url(self, drs_uri: Optional[List[AnyUrl]], _info): + if drs_uri is not None and len(drs_uri) > 0: + return [str(u) for u in drs_uri] + return drs_uri + ContentsObject.update_forward_refs() @@ -127,6 +138,10 @@ class DrsObject(DrsObjectIn): id: DrsId self_uri: AnyUrl + @field_serializer("self_uri") + def serialize_url(self, self_uri: AnyUrl, _info): + return str(self_uri) + Seconds = Annotated[int, Field(strict=True, gt=0)] @@ -135,6 +150,10 @@ class ObjectPresignedUrl(BaseModel): url: HttpUrl expires_in: Seconds = 300 + @field_serializer("url") + def serialize_url(self, url: HttpUrl, _info): + return str(url) + class DrsObjectOutBase(DrsObjectBase): checksums: List[Checksum] @@ -145,6 +164,10 @@ class DrsObjectOutBase(DrsObjectBase): updated_time: Optional[datetime.datetime] = None version: Optional[str] = None + @field_serializer("self_uri") + def serialize_url(self, slf_uri: AnyUrl, _info): + return str(self_uri) + class DrsObjectBlobOut(DrsObjectOutBase): access_methods: List[AccessMethod] diff --git a/nmdc_runtime/api/models/operation.py b/nmdc_runtime/api/models/operation.py index e1819f24..035100cd 100644 --- a/nmdc_runtime/api/models/operation.py +++ b/nmdc_runtime/api/models/operation.py @@ -1,7 +1,7 @@ import datetime from typing import Generic, TypeVar, Optional, List, Any, Union -from pydantic import StringConstraints, BaseModel, HttpUrl +from pydantic import StringConstraints, BaseModel, HttpUrl, field_serializer from nmdc_runtime.api.models.util import ResultT from typing_extensions import Annotated @@ -59,3 +59,7 @@ class ObjectPutMetadata(Metadata): site_id: str url: HttpUrl expires_in_seconds: int + + @field_serializer("url") + def serialize_url(self, url: HttpUrl, _info): + return str(url) diff --git a/nmdc_runtime/api/models/run.py b/nmdc_runtime/api/models/run.py index 43bf734e..3cdf43d1 100644 --- a/nmdc_runtime/api/models/run.py +++ b/nmdc_runtime/api/models/run.py @@ -93,11 +93,7 @@ def _add_run_requested_event(run_spec: RunUserSpec, mdb: MongoDatabase, user: Us time=now(as_str=True), inputs=run_spec.inputs, ) - mdb.run_events.insert_one( - event.model_dump( - mode="json", - ) - ) + mdb.run_events.insert_one(event.model_dump()) return run_id @@ -117,9 +113,7 @@ def _add_run_started_event(run_id: str, mdb: MongoDatabase): 
job=requested.job, type=RunEventType.STARTED, time=now(as_str=True), - ).model_dump( - mode="json", - ) + ).model_dump() ) return run_id @@ -140,9 +134,7 @@ def _add_run_fail_event(run_id: str, mdb: MongoDatabase): job=requested.job, type=RunEventType.FAIL, time=now(as_str=True), - ).model_dump( - mode="json", - ) + ).model_dump() ) return run_id @@ -164,8 +156,6 @@ def _add_run_complete_event(run_id: str, mdb: MongoDatabase, outputs: List[str]) type=RunEventType.COMPLETE, time=now(as_str=True), outputs=outputs, - ).model_dump( - mode="json", - ) + ).model_dump() ) return run_id diff --git a/nmdc_runtime/minter/adapters/repository.py b/nmdc_runtime/minter/adapters/repository.py index 879bdcc8..96775c92 100644 --- a/nmdc_runtime/minter/adapters/repository.py +++ b/nmdc_runtime/minter/adapters/repository.py @@ -97,9 +97,7 @@ def mint(self, req_mint: MintingRequest) -> list[Identifier]: ) ) for id_ in ids: - self.db[id_.id] = id_.model_dump( - mode="json", - ) + self.db[id_.id] = id_.model_dump() return ids def bind(self, req_bind: BindingRequest) -> Identifier: @@ -186,14 +184,7 @@ def mint(self, req_mint: MintingRequest) -> list[Identifier]: ) for id_name in not_taken ] - self.db["minter.id_records"].insert_many( - [ - i.model_dump( - mode="json", - ) - for i in ids - ] - ) + self.db["minter.id_records"].insert_many([i.model_dump() for i in ids]) collected.extend(ids) if len(collected) == req_mint.how_many: break diff --git a/nmdc_runtime/minter/entrypoints/fastapi_app.py b/nmdc_runtime/minter/entrypoints/fastapi_app.py index 3d4b7efc..996cb575 100644 --- a/nmdc_runtime/minter/entrypoints/fastapi_app.py +++ b/nmdc_runtime/minter/entrypoints/fastapi_app.py @@ -40,9 +40,7 @@ def mint_ids( MintingRequest( service=service, requester=requester, - **req_mint.model_dump( - mode="json", - ), + **req_mint.model_dump(), ) ) return [d.id for d in minted] diff --git a/nmdc_runtime/site/drsobjects/ingest.py b/nmdc_runtime/site/drsobjects/ingest.py index 26a26f43..42eeb8a5 100644 --- a/nmdc_runtime/site/drsobjects/ingest.py +++ b/nmdc_runtime/site/drsobjects/ingest.py @@ -44,11 +44,7 @@ def claim_metadata_ingest_jobs( ) jobs = [] while True: - rv = client.list_jobs( - lr.model_dump( - mode="json", - ) - ).json() + rv = client.list_jobs(lr.model_dump()).json() jobs.extend(rv["resources"]) if "next_page_token" not in rv: break diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index 265498ab..eecd4665 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -267,11 +267,7 @@ def get_operation(context): def produce_curated_db(context, op: Operation): client: RuntimeApiSiteClient = context.resources.runtime_api_site_client mdb: MongoDatabase = context.resources.mongo.db - op = Operation[ResultT, JobOperationMetadata]( - **op.model_dump( - mode="json", - ) - ) + op = Operation[ResultT, JobOperationMetadata](**op.model_dump()) op_meta: JobOperationMetadata = op.metadata job_id = op_meta.job.id job = mdb.jobs.find_one({"id": job_id}) @@ -354,12 +350,7 @@ def filter_ops_undone_expired() -> str: @op(required_resource_keys={"runtime_api_site_client"}) def list_operations(context, filter_: str) -> list: client = context.resources.runtime_api_site_client - ops = [ - op.model_dump( - mode="json", - ) - for op in client.list_operations({"filter": filter_}) - ] + ops = [op.model_dump() for op in client.list_operations({"filter": filter_})] context.log.info(str(len(ops))) return ops @@ -475,7 +466,7 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn): op = 
Operation(**mdb.operations.find_one({"id": op_id})) op.done = True op.result = {"update_cmd": json.dumps(update_cmd)} - op_doc = op.model_dump(mode="json", exclude_unset=True) + op_doc = op.model_dump(exclude_unset=True) mdb.operations.replace_one({"id": op_id}, op_doc) return ["/operations/" + op_doc["id"]] diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 22b9a9c0..bbd7ae5e 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -405,9 +405,7 @@ def claim_and_run_apply_changesheet_jobs(_context): def done_object_put_ops(_context): client = get_runtime_api_site_client(run_config_frozen__normal_env) ops = [ - op.model_dump( - mode="json", - ) + op.model_dump() for op in client.list_operations( { "filter": json.dumps( diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 983ab206..5f857a77 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -60,9 +60,7 @@ def request(self, method, url_path, params_or_json_data=None): self.ensure_token() kwargs = {"url": self.base_url + url_path, "headers": self.headers} if isinstance(params_or_json_data, BaseModel): - params_or_json_data = params_or_json_data.model_dump( - mode="json", exclude_unset=True - ) + params_or_json_data = params_or_json_data.model_dump(exclude_unset=True) if method.upper() == "GET": kwargs["params"] = params_or_json_data else: diff --git a/tests/integration/test_minter_repository.py b/tests/integration/test_minter_repository.py index 45ad0b56..2524ade4 100644 --- a/tests/integration/test_minter_repository.py +++ b/tests/integration/test_minter_repository.py @@ -31,9 +31,7 @@ def test_mint_and_resolve(): id_: Identifier = next(i for i in s.mint(req_mint)) req_res = ResolutionRequest( id_name=id_.name, - **req_mint.model_dump( - mode="json", - ), + **req_mint.model_dump(), ) assert s.resolve(req_res) is not None @@ -44,21 +42,10 @@ def test_mint_and_delete(): id_: Identifier = next(i for i in s.mint(req_mint)) req_del = DeleteRequest( id_name=id_.name, - **req_mint.model_dump( - mode="json", - ), + **req_mint.model_dump(), ) s.delete(req_del) - assert ( - s.resolve( - ResolutionRequest( - **req_del.model_dump( - mode="json", - ) - ) - ) - is None - ) + assert s.resolve(ResolutionRequest(**req_del.model_dump())) is None def test_mongo_mint_one(): @@ -91,9 +78,7 @@ def test_mongo_mint_and_resolve(): id_: Identifier = next(i for i in s.mint(req_mint)) req_res = ResolutionRequest( id_name=id_.name, - **req_mint.model_dump( - mode="json", - ), + **req_mint.model_dump(), ) assert s.resolve(req_res) is not None @@ -106,19 +91,8 @@ def test_mongo_mint_and_delete(): id_: Identifier = next(i for i in s.mint(req_mint)) req_del = DeleteRequest( id_name=id_.name, - **req_mint.model_dump( - mode="json", - ), + **req_mint.model_dump(), ) s.delete(req_del) - assert ( - s.resolve( - ResolutionRequest( - **req_del.model_dump( - mode="json", - ) - ) - ) - is None - ) + assert s.resolve(ResolutionRequest(**req_del.model_dump())) is None assert s.db["minter.id_records"].count_documents({}) == 0 diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index 387154b0..68a597ff 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -32,7 +32,7 @@ def ensure_test_resources(mdb): username=username, hashed_password=get_password_hash(password), site_admin=[site_id], - ).model_dump(mode="json", exclude_unset=True), + ).model_dump(exclude_unset=True), upsert=True, ) @@ -48,9 +48,7 
@@ def ensure_test_resources(mdb): hashed_secret=get_password_hash(client_secret), ) ], - ).model_dump( - mode="json", - ), + ).model_dump(), upsert=True, ) wf_id = "test" @@ -59,7 +57,7 @@ def ensure_test_resources(mdb): mdb.operations.delete_many(prev_ops) job = Job(**{"id": job_id, "workflow": {"id": wf_id}, "config": {}, "claims": []}) mdb.jobs.replace_one( - {"id": job_id}, job.model_dump(mode="json", exclude_unset=True), upsert=True + {"id": job_id}, job.model_dump(exclude_unset=True), upsert=True ) return { "site_client": { @@ -68,7 +66,7 @@ def ensure_test_resources(mdb): "client_secret": client_secret, }, "user": {"username": username, "password": password}, - "job": job.model_dump(mode="json", exclude_unset=True), + "job": job.model_dump(exclude_unset=True), } @@ -124,7 +122,7 @@ def get_token(): "POST", url=(base_url + "/users"), headers=headers, - json=user_in.model_dump(mode="json", exclude_unset=True), + json=user_in.model_dump(exclude_unset=True), ) try: diff --git a/tests/test_graphs/ensure_jobs.py b/tests/test_graphs/ensure_jobs.py new file mode 100644 index 00000000..5a7359dd --- /dev/null +++ b/tests/test_graphs/ensure_jobs.py @@ -0,0 +1,26 @@ +import pytest +from toolz import merge + +from nmdc_runtime.site.graphs import ensure_jobs +from nmdc_runtime.site.repository import preset_normal + + +@pytest.skip("Needs supplied state") +def test_ensure_jobs(): + job = ensure_jobs.to_job(name="test_ensure_jobs", **preset_normal) + run_config = merge({}, preset_normal["config"]) + run_config["ops"] = { + "construct_jobs": { + "config": { + "base_jobs": [ + { + "config": {"object_id": "gfs03r29"}, + "workflow": {"id": "apply-changesheet-1.0"}, + } + ] + } + } + } + result = job.execute_in_process(run_config=run_config) + + assert result.success From b6fc6ce6f3863151cb22c4b6a8d15130c162992e Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Tue, 7 Nov 2023 10:51:13 -0500 Subject: [PATCH 17/28] refactor GH actions --- .github/workflows/build-and-push-docker-images.yml | 6 ++++++ .github/workflows/{release.yml => release-to-pypi.yml} | 0 2 files changed, 6 insertions(+) rename .github/workflows/{release.yml => release-to-pypi.yml} (100%) diff --git a/.github/workflows/build-and-push-docker-images.yml b/.github/workflows/build-and-push-docker-images.yml index 3e6f7c15..5ef44f46 100644 --- a/.github/workflows/build-and-push-docker-images.yml +++ b/.github/workflows/build-and-push-docker-images.yml @@ -4,6 +4,12 @@ on: push: branches: - main + paths: + - '.github/workflows/build-and-push-docker-images.yml' + - 'Makefile' + - '**.Dockerfile' + - '**.py' + - 'requirements/main.txt' jobs: docker: diff --git a/.github/workflows/release.yml b/.github/workflows/release-to-pypi.yml similarity index 100% rename from .github/workflows/release.yml rename to .github/workflows/release-to-pypi.yml From 5f46aa39aaf9bbce5f8e46c50dbc601965568bd8 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Tue, 7 Nov 2023 11:27:10 -0500 Subject: [PATCH 18/28] hotfix: api version display --- nmdc_runtime/api/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 71a6863e..007f3cc5 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -351,7 +351,9 @@ async def get_versions(): app = FastAPI( title="NMDC Runtime API", - version=get_version(), + # TODO this does not work: `version=get_version()` + # Below is hotfix for reasonable display in prod deployment. 
+ version="1.0.7", description=( "The NMDC Runtime API, via on-demand functions " "and via schedule-based and sensor-based automation, " From 3e1f2bf20cf62078cac823be6f9bad7712824220 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 7 Nov 2023 14:12:48 -0800 Subject: [PATCH 19/28] Create notebook for migrating data from `nmdc-schema` `8.1.2` to `9.0.4` --- .../notebooks/migrate_8_1_2_to_9_0_4.ipynb | 585 ++++++++++++++++++ 1 file changed, 585 insertions(+) create mode 100644 demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb new file mode 100644 index 00000000..f844271c --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb @@ -0,0 +1,585 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Migrate Mongo data from `nmdc-schema` [`v8.1.2`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.1.2) to [`v9.0.4`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v9.0.4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### 1. Determine Mongo collections that will be transformed\n", + "\n", + "In this step, you will determine which Mongo collections will be transformed during this migration.\n", + "\n", + "1. In [`nmdc_schema/migration_recursion.py`](https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py), locate the Python class whose name reflects the initial and final version numbers of this migration.\n", + "2. In that Python class, locate the `self.agenda` dictionary.\n", + "3. In that dictionary, make a list of the keys—these are the names of the Mongo collections that will be transformed during this migration. For example:\n", + " ```py\n", + " self.agenda = dict(\n", + " collection_name_1=[self.some_function],\n", + " collection_name_2=[self.some_function],\n", + " )\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Coordinate with teammates that read/write to those collections\n", + "\n", + "In this step, you'll identify and reach out to the people that read/write to those collections; to agree on a migration schedule that works for you and them.\n", + "\n", + "Here's a table of Mongo collections and the components of the NMDC system that write to them (according to [a conversation that occurred on September 11, 2023](https://nmdc-group.slack.com/archives/C01SVTKM8GK/p1694465755802979?thread_ts=1694216327.234519&cid=C01SVTKM8GK)).\n", + "\n", + "| Mongo collection | NMDC system components that write to it |\n", + "|---------------------------------------------|----------------------------------------------------------|\n", + "| `biosample_set` | Workflows (via manual entry via `nmdc-runtime` HTTP API) |\n", + "| `data_object_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `mags_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_annotation_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_assembly_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_based_taxonomy_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_qc_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `jobs` | Scheduler (via Mongo directly) |\n", + "| `*` | `nmdc-runtime` (via Mongo directly) |\n", + "\n", + "You can use that table to help determine which people read/write to those collections. You can then coordinate a migration time slot with them via Slack, email, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Setup a migration environment\n", + "\n", + "In this step, you'll set up an environment in which you can run this notebook.\n", + "\n", + "1. Start a **Mongo server** on your local machine (and ensure it does **not** contain a database named `nmdc`).\n", + " 1. You can start a temporary, [Docker](https://hub.docker.com/_/mongo)-based Mongo server at `localhost:27055` by running this command:\n", + " ```shell\n", + " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo\n", + " ```\n", + " > Note: A Mongo server started via that command will have no access control (i.e. you will be able to access it without a username or password).\n", + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. You can use the `.notebook.env.example` file as a template:\n", + " ```shell\n", + " $ cp .notebook.env.example .notebook.env\n", + " ```\n", + "3. Create and populate **Mongo configuration files** for connecting to the origin and transformer Mongo servers.\n", + " 1. You can use the `.mongo.yaml.example` file as a template:\n", + " ```shell\n", + " $ cp .mongo.yaml.example .mongo.origin.yaml\n", + " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", + " ```\n", + " > When populating the file for the origin Mongo server, use credentials that have write access to the `nmdc` database." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Python dependencies\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends. 
You can do that by running this cell.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook) now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==9.0.4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "> Note: One of the Python objects is a Python class that is specific to this migration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library packages:\n", + "from pathlib import Path\n", + "from shutil import rmtree\n", + "from copy import deepcopy\n", + "\n", + "# Third-party packages:\n", + "import pymongo\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.migration_recursion import Migrator_from_8_0_0_to_8_1_0 as Migrator\n", + "from jsonschema import Draft7Validator\n", + "from dictdiffer import diff\n", + "\n", + "# First-party packages:\n", + "from helpers import Config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Programmatically determine which collections will be transformed\n", + "\n", + "Here are the names of the collections this migration will transform.\n", + "\n", + "> Ensure you have coordinated with the people that read/write to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agenda_collection_names = Migrator().agenda.keys()\n", + "\n", + "print(\"The following collections will be transformed:\")\n", + "print(\"\\n\".join(agenda_collection_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the application paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!{mongodump} --version\n", + "!{mongorestore} --version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Mongo clients\n", + "\n", + "Create Mongo clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations)." 
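+    "\n",
+    "> Note: If you want a connectivity check that is independent of the sanity tests below, one optional approach (a sketch — the URI shown is only an example) is to send MongoDB's `ping` command to a server:\n",
+    "\n",
+    "```python\n",
+    "import pymongo\n",
+    "\n",
+    "client = pymongo.MongoClient(host=\"mongodb://localhost:27055\")  # example URI only\n",
+    "print(client.admin.command(\"ping\"))  # prints {'ok': 1.0} if the server is reachable\n",
+    "```\n",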
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mongo client for origin Mongo server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "\n", + "# Mongo client for transformer Mongo server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the Mongo clients' abilities to access their respective Mongo servers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the Mongo server version (running on the \"origin\" Mongo server).\n", + "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Sanity test: Ensure the origin database exists.\n", + "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n", + "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Sanity test: Ensure the transformation database does not exist.\n", + "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create JSON Schema validator\n", + "\n", + "In this step, you'll create a JSON Schema validator for the NMDC Schema." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", + "\n", + "> Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", + "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", + "\n", + "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump collections from the \"origin\" Mongo server\n", + "\n", + "In this step, you'll use `mongodump` to dump the collections that will be transformed during this migration; from the \"origin\" Mongo server.\n", + "\n", + "Since `mongodump` doesn't provide a CLI option that you can use to specify the collections you _want_ it to dump (unless that is only one collection), you can use a different CLI option to tell it all the collection you do _not_ want it to dump. The end result will be the same—there's just an extra step involved." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That extra step is to generate an `--excludeCollection=\"{name}\"` CLI option for each collection that is not on the agenda, which you'll do now." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", + "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", + "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n", + "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", + "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "\n", + "print(exclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongodump` command containing all those `--excludeCollection=\"{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the not-excluded collections from the origin database.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.origin_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the collections into the \"transformer\" Mongo server\n", + "\n", + "In this step, you'll load the collections dumped from the \"origin\" Mongo server, into the \"transformer\" MongoDB server.\n", + "\n", + "Since it's possible that the dump includes more collections than are on the agenda (due to someone creating a collection between the time you generated the exclusion list and the time you ran `mongodump`), you will use one or more of `mongorestore`'s `--nsInclude` CLI options to indicate which collections you want to load.\n", + "\n", + "Here's where you will generate the `--nsInclude=\"nmdc.{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n", + "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "\n", + "print(inclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongorestore` command containing all those `--nsInclude=\"nmdc.{name}\"` CLI options." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Restore the dumped collections to the transformer MongoDB server.\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform the collections within the \"transformer\" Mongo server\n", + "\n", + "Now that the transformer database contains a copy of each collection on the agenda, you can transform those copies.\n", + "\n", + "The transformation functions are provided by the `nmdc-schema` Python package.\n", + "> You can examine the transformation functions at: https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py\n", + "\n", + "In this step, you will retrieve each documents from each collection on the agenda, pass it to the associated transformation function(s) on the agenda, then store the transformed document in place of the original one—all within the \"transformation\" database only. **The \"origin\" database is not involved with this step.**\n", + "\n", + "> Note: This step also includes validation. Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py\n", + "\n", + "> Note: This step also include a before-and-after comparison to facilitate manual spot checks. References: https://docs.python.org/3/library/copy.html#copy.deepcopy and https://dictdiffer.readthedocs.io/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "migrator = Migrator()\n", + "\n", + "# Apply the transformations.\n", + "for collection_name, transformation_pipeline in migrator.agenda.items():\n", + " print(f\"Transforming documents in collection: {collection_name}\")\n", + " transformed_documents = []\n", + "\n", + " # Get each document from this collection.\n", + " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " for original_document in collection.find():\n", + " # Make a deep copy of the original document, to enable before-and-after comparison.\n", + " print(original_document)\n", + " copy_of_original_document = deepcopy(original_document)\n", + " \n", + " # Put the document through the transformation pipeline associated with this collection.\n", + " transformed_document = original_document # initializes the variable\n", + " for transformation_function in transformation_pipeline:\n", + " transformed_document = transformation_function(transformed_document)\n", + " print(transformed_document)\n", + " \n", + " # Compare the transformed document with a copy of the original document;\n", + " # and, if there are any differences, print those differences.\n", + " difference = diff(copy_of_original_document, transformed_document)\n", + " differences = list(difference)\n", + " if len(differences) > 0:\n", + " print(f\"✏️ {differences}\")\n", + "\n", + " # Validate the transformed document.\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. 
However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", + " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", + " #\n", + " transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n", + " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", + " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + "\n", + " # Store the transformed document.\n", + " transformed_documents.append(transformed_document) \n", + " print(\"\") \n", + "\n", + " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", + " for transformed_document in transformed_documents:\n", + " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump the transformed collections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the database from the transformer MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the transformed data into the \"origin\" Mongo server\n", + "\n", + "In this step, you'll put the transformed collection(s) into the origin MongoDB server, replacing the original collection(s) that have the same name(s)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", + " --drop \\\n", + " --preserveUUID \\ \n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Clean up\n", + "\n", + "Delete the temporary files and MongoDB dumps created by this notebook.\n", + "\n", + "> Note: You can skip this step, in case you want to delete them manually later (e.g. to examine them before deleting them)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "paths_to_files_to_delete = []\n", + "\n", + "paths_to_folders_to_delete = [\n", + " cfg.origin_dump_folder_path,\n", + " cfg.transformer_dump_folder_path,\n", + "]\n", + "\n", + "# Delete files.\n", + "for path in [Path(string) for string in paths_to_files_to_delete]:\n", + " try:\n", + " path.unlink()\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")\n", + "\n", + "# Delete folders.\n", + "for path in [Path(string) for string in paths_to_folders_to_delete]:\n", + " try:\n", + " rmtree(path)\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 45ec88773adf98df5cf1a8d749e3d6bd923e36bc Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 7 Nov 2023 17:32:26 -0800 Subject: [PATCH 20/28] Update `nmdc-schema` package from `8.0.0` to `8.1.2` --- RELEASES.md | 2 ++ requirements/main.in | 2 +- requirements/main.txt | 36 +++++++++++++++++------------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/RELEASES.md b/RELEASES.md index 3b0ab860..64543208 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -13,6 +13,8 @@ Use to express the current date and tim time offset for New York on standard time (EST). "−08:00" would be for California. 
## Release Log +* 2023-11-07T17:30:00-08:00 update nmdc-schema package from 8.0.0 to 8.1.2 +* (missing entries) * 2023-08-31T22:15:00-07:00 update nmdc-schema package from 7.7.2 to 7.8.0 * 2023-01-27T13:13:09-05:00 return 201 on activity creation * 2023-01-25T13:13:09-05:00 all typecodes for minter diff --git a/requirements/main.in b/requirements/main.in index 46551ccd..1e2fe070 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -24,7 +24,7 @@ mkdocs-jupyter mkdocs-material mkdocs-mermaid2-plugin motor -nmdc-schema==8.0.0 +nmdc-schema==8.1.2 openpyxl pandas passlib[bcrypt] diff --git a/requirements/main.txt b/requirements/main.txt index 1ee0dc8b..2672f950 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -67,9 +67,9 @@ black==23.10.1 # via shed bleach==6.1.0 # via nbconvert -boto3==1.28.77 +boto3==1.28.80 # via -r requirements/main.in -botocore==1.31.77 +botocore==1.31.80 # via # boto3 # s3transfer @@ -113,7 +113,7 @@ coloredlogs==14.0 # via dagster com2ann==0.3.0 # via shed -comm==0.1.4 +comm==0.2.0 # via # ipykernel # ipywidgets @@ -121,7 +121,7 @@ croniter==2.0.1 # via dagster cryptography==41.0.5 # via python-jose -curies==0.7.0 +curies==0.7.2 # via linkml-runtime dagit==1.5.6 # via -r requirements/main.in @@ -254,8 +254,6 @@ ipython==8.17.2 # ipykernel # ipywidgets # jupyter-console -ipython-genutils==0.2.0 - # via qtconsole ipywidgets==8.1.1 # via jupyter isodate==0.6.1 @@ -287,7 +285,7 @@ jmespath==1.0.1 # botocore jq==1.6.0 # via -r requirements/main.in -jsbeautifier==1.14.9 +jsbeautifier==1.14.11 # via mkdocs-mermaid2-plugin json-flattener==0.1.9 # via linkml-runtime @@ -322,7 +320,7 @@ jsonschema-specifications==2023.7.1 # via jsonschema jupyter==1.0.0 # via -r requirements/main.in -jupyter-client==8.5.0 +jupyter-client==8.6.0 # via # ipykernel # jupyter-console @@ -342,11 +340,11 @@ jupyter-core==5.5.0 # nbconvert # nbformat # qtconsole -jupyter-events==0.8.0 +jupyter-events==0.9.0 # via jupyter-server jupyter-lsp==2.2.0 # via jupyterlab -jupyter-server==2.9.1 +jupyter-server==2.10.0 # via # jupyter-lsp # jupyterlab @@ -373,13 +371,13 @@ lazy-model==0.2.0 # via beanie libcst==1.1.0 # via shed -linkml==1.6.1 +linkml==1.6.2 # via # -r requirements/main.in # nmdc-schema linkml-dataops==0.1.0 # via linkml -linkml-runtime==1.6.0 +linkml-runtime==1.6.1 # via # -r requirements/main.in # linkml @@ -421,7 +419,7 @@ mkdocs==1.5.3 # mkdocs-mermaid2-plugin mkdocs-jupyter==0.24.6 # via -r requirements/main.in -mkdocs-material==9.4.7 +mkdocs-material==9.4.8 # via # -r requirements/main.in # mkdocs-jupyter @@ -439,9 +437,9 @@ mypy-extensions==1.0.0 # via # black # typing-inspect -nbclient==0.8.0 +nbclient==0.9.0 # via nbconvert -nbconvert==7.10.0 +nbconvert==7.11.0 # via # jupyter # jupyter-server @@ -454,7 +452,7 @@ nbformat==5.9.2 # nbconvert nest-asyncio==1.5.8 # via ipykernel -nmdc-schema==8.0.0 +nmdc-schema==8.1.2 # via -r requirements/main.in notebook==7.0.6 # via jupyter @@ -657,7 +655,7 @@ pyzmq==25.1.1 # jupyter-console # jupyter-server # qtconsole -qtconsole==5.4.4 +qtconsole==5.5.0 # via jupyter qtpy==2.4.1 # via qtconsole @@ -716,7 +714,7 @@ rfc3986-validator==0.1.1 # jupyter-events rfc3987==1.3.8 # via jsonschema -rpds-py==0.10.6 +rpds-py==0.12.0 # via # jsonschema # referencing @@ -899,7 +897,7 @@ urllib3==1.26.18 # pyshex # requests # requests-cache -uvicorn==0.23.2 +uvicorn==0.24.0.post1 # via # -r requirements/main.in # dagster-webserver From 8f3a3c291abde912b99a762e2aa19e5a5cf29db8 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 7 Nov 2023 
18:10:28 -0800 Subject: [PATCH 21/28] =?UTF-8?q?Delete=20rogue=20whitespace=20following?= =?UTF-8?q?=20line=20continuation=20character=20=F0=9F=98=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb index f844271c..d3fd259e 100644 --- a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb @@ -515,7 +515,7 @@ " --verbose \\\n", " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", " --drop \\\n", - " --preserveUUID \\ \n", + " --preserveUUID \\\n", " {inclusion_options_str}" ] }, From 2e554f9eeb8a19113a740b849120576aaa365222 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 7 Nov 2023 18:15:23 -0800 Subject: [PATCH 22/28] Update `import` statement and clarify shell command context --- .../notebooks/migrate_8_1_2_to_9_0_4.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb index d3fd259e..d5f570cd 100644 --- a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb @@ -64,17 +64,20 @@ "1. Start a **Mongo server** on your local machine (and ensure it does **not** contain a database named `nmdc`).\n", " 1. You can start a temporary, [Docker](https://hub.docker.com/_/mongo)-based Mongo server at `localhost:27055` by running this command:\n", " ```shell\n", + " # Run in any directory:\n", " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo\n", " ```\n", " > Note: A Mongo server started via that command will have no access control (i.e. you will be able to access it without a username or password).\n", "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", " 1. You can use the `.notebook.env.example` file as a template:\n", " ```shell\n", + " # Run in the same directory as this notebook:\n", " $ cp .notebook.env.example .notebook.env\n", " ```\n", "3. Create and populate **Mongo configuration files** for connecting to the origin and transformer Mongo servers.\n", " 1. 
You can use the `.mongo.yaml.example` file as a template:\n", " ```shell\n", + " # Run in the same directory as this notebook:\n", " $ cp .mongo.yaml.example .mongo.origin.yaml\n", " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", " ```\n", @@ -134,7 +137,7 @@ "# Third-party packages:\n", "import pymongo\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", - "from nmdc_schema.migration_recursion import Migrator_from_8_0_0_to_8_1_0 as Migrator\n", + "from nmdc_schema.migration_recursion import Migrator_from_8_1_to_9_0 as Migrator\n", "from jsonschema import Draft7Validator\n", "from dictdiffer import diff\n", "\n", From 78543552d463c874b8ccc82596a3967ab49e1166 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 7 Nov 2023 19:29:59 -0800 Subject: [PATCH 23/28] Implement workarounds for issues nmdc-schema#1310 and nmdc-schema#1311 --- .../notebooks/migrate_8_1_2_to_9_0_4.ipynb | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb index d5f570cd..b2e91df3 100644 --- a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb +++ b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb @@ -4,7 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Migrate Mongo data from `nmdc-schema` [`v8.1.2`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.1.2) to [`v9.0.4`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v9.0.4)" + "# Migrate Mongo data from `nmdc-schema` [`v8.1.2`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.1.2) to [`v9.0.4`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v9.0.4)\n", + "\n", + "## Preface\n", + "\n", + "Download a copy of https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/study_dois_changes.yaml and save it at the path `assets/misc/study_dois_changes.yaml` relative to this notebook. This is a workaround for issue https://github.com/microbiomedata/nmdc-schema/issues/1310. " ] }, { @@ -439,7 +443,22 @@ " # Put the document through the transformation pipeline associated with this collection.\n", " transformed_document = original_document # initializes the variable\n", " for transformation_function in transformation_pipeline:\n", - " transformed_document = transformation_function(transformed_document)\n", + " #\n", + " # THIS IS A WORKAROUND FOR ISSUE https://github.com/microbiomedata/nmdc-schema/issues/1311\n", + " #\n", + " # Note: Some of the transformation functions in the migration class specific to this migration\n", + " # do not return the transformed dictionary. As a result, transformation functions\n", + " # \"further down the pipeline\" do not receive a dictionary as input.\n", + " # \n", + " # The workaround I have employed here is:\n", + " # 1. Manually read the transformation functions and verify they all do, indeed,\n", + " # modify the dictionary \"in place\" (as opposed to returning a copy).\n", + " # 2. 
Once that has been verified; replace the standard notebook code\n", + " # (commented-out below) with code that ignores the return value\n", + " # of the transformation functions.\n", + " #\n", + " # transformed_document = transformation_function(transformed_document)\n", + " transformation_function(transformed_document)\n", " print(transformed_document)\n", " \n", " # Compare the transformed document with a copy of the original document;\n", From 1d2039cf3af0f665d68a8812ac360eb6937ee647 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 7 Nov 2023 19:40:05 -0800 Subject: [PATCH 24/28] Update `nmdc-schema` package from `8.1.2` to `9.0.4` --- RELEASES.md | 1 + requirements/main.in | 2 +- requirements/main.txt | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/RELEASES.md b/RELEASES.md index 64543208..aae3ffba 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -13,6 +13,7 @@ Use to express the current date and tim time offset for New York on standard time (EST). "−08:00" would be for California. ## Release Log +* 2023-11-07T19:30:00-08:00 update nmdc-schema package from 8.1.2 to 9.0.4 * 2023-11-07T17:30:00-08:00 update nmdc-schema package from 8.0.0 to 8.1.2 * (missing entries) * 2023-08-31T22:15:00-07:00 update nmdc-schema package from 7.7.2 to 7.8.0 diff --git a/requirements/main.in b/requirements/main.in index 1e2fe070..dc99d611 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -24,7 +24,7 @@ mkdocs-jupyter mkdocs-material mkdocs-mermaid2-plugin motor -nmdc-schema==8.1.2 +nmdc-schema==9.0.4 openpyxl pandas passlib[bcrypt] diff --git a/requirements/main.txt b/requirements/main.txt index 2672f950..c6175d7c 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -452,7 +452,7 @@ nbformat==5.9.2 # nbconvert nest-asyncio==1.5.8 # via ipykernel -nmdc-schema==8.1.2 +nmdc-schema==9.0.4 # via -r requirements/main.in notebook==7.0.6 # via jupyter @@ -721,7 +721,9 @@ rpds-py==0.12.0 rsa==4.9 # via python-jose ruamel-yaml==0.18.5 - # via linkml-dataops + # via + # linkml-dataops + # nmdc-schema ruamel-yaml-clib==0.2.8 # via ruamel-yaml s3transfer==0.7.0 From 41a8b6cde86969efd983ce583345d9cea6bf6760 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 7 Nov 2023 19:50:32 -0800 Subject: [PATCH 25/28] Update `nmdc-schema` package from `9.0.4` to `9.1.0` --- RELEASES.md | 1 + requirements/main.in | 2 +- requirements/main.txt | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/RELEASES.md b/RELEASES.md index aae3ffba..92e9b802 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -13,6 +13,7 @@ Use to express the current date and tim time offset for New York on standard time (EST). "−08:00" would be for California. 
## Release Log +* 2023-11-07T19:45:00-08:00 update nmdc-schema package from 9.0.4 to 9.1.0 * 2023-11-07T19:30:00-08:00 update nmdc-schema package from 8.1.2 to 9.0.4 * 2023-11-07T17:30:00-08:00 update nmdc-schema package from 8.0.0 to 8.1.2 * (missing entries) diff --git a/requirements/main.in b/requirements/main.in index dc99d611..067940de 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -24,7 +24,7 @@ mkdocs-jupyter mkdocs-material mkdocs-mermaid2-plugin motor -nmdc-schema==9.0.4 +nmdc-schema==9.1.0 openpyxl pandas passlib[bcrypt] diff --git a/requirements/main.txt b/requirements/main.txt index c6175d7c..5fba6141 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -452,7 +452,7 @@ nbformat==5.9.2 # nbconvert nest-asyncio==1.5.8 # via ipykernel -nmdc-schema==9.0.4 +nmdc-schema==9.1.0 # via -r requirements/main.in notebook==7.0.6 # via jupyter From 903f5c318e088187956f828eaa8aed6298d2189e Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Mon, 13 Nov 2023 10:40:49 -0500 Subject: [PATCH 26/28] fix: include mongo-command writeErrors for http response code (#375) * fix: include mongo-command update_info writeErrors when determining response code fixes #326 * fix: default to empty dict --- nmdc_runtime/api/core/metadata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nmdc_runtime/api/core/metadata.py b/nmdc_runtime/api/core/metadata.py index 41a5725b..8f411f25 100644 --- a/nmdc_runtime/api/core/metadata.py +++ b/nmdc_runtime/api/core/metadata.py @@ -733,6 +733,11 @@ def _validate_changesheet(df_change: pd.DataFrame, mdb: MongoDatabase): for result in results_of_updates: if len(result.get("validation_errors", [])) > 0: validation_errors.append(result["validation_errors"]) + if ( + len(write_errors := result.get("update_info", {}).get("writeErrors", {})) + > 0 + ): + validation_errors.append(write_errors) if validation_errors: raise HTTPException( From fd3aee97ae3481e1fc8eec4b650f29a386848cc2 Mon Sep 17 00:00:00 2001 From: Brynn Zalmanek <46435419+brynnz22@users.noreply.github.com> Date: Mon, 13 Nov 2023 08:11:41 -0800 Subject: [PATCH 27/28] add checksum timestamp changes (#371) * add checksum timestamp changes * style: black reformat * style: DRY * add unit test --------- Co-authored-by: Donny Winston --- nmdc_runtime/api/core/util.py | 5 ++++- nmdc_runtime/api/endpoints/util.py | 6 +++++- nmdc_runtime/util.py | 4 ++-- tests/unit/core_util.py | 21 +++++++++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 tests/unit/core_util.py diff --git a/nmdc_runtime/api/core/util.py b/nmdc_runtime/api/core/util.py index 48d15ed1..ad97471f 100644 --- a/nmdc_runtime/api/core/util.py +++ b/nmdc_runtime/api/core/util.py @@ -28,10 +28,13 @@ def hash_from_str(s: str, algo="sha256") -> str: return getattr(hashlib, algo)(s.encode("utf-8")).hexdigest() -def sha256hash_from_file(file_path: str): +def sha256hash_from_file(file_path: str, timestamp: str): # https://stackoverflow.com/a/55542529 h = hashlib.sha256() + timestamp_bytes = timestamp.encode("utf-8") + h.update(timestamp_bytes) + with open(file_path, "rb") as file: while True: # Reading is buffered, so we can read smaller chunks. 
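The effect of threading the timestamp into the hash can be sketched in isolation (a minimal illustration using only `hashlib`; the helper name below is made up and is not part of this patch): because the timestamp is hashed before the file bytes, identical content yields different digests whenever the timestamp differs, which is what the unit test added later in this patch verifies against the real `sha256hash_from_file`.

```python
# Minimal sketch: the timestamp acts as a salt that is hashed before the file content.
import hashlib

def salted_sha256(content: bytes, timestamp: str) -> str:
    h = hashlib.sha256()
    h.update(timestamp.encode("utf-8"))  # timestamp first, then the content
    h.update(content)
    return h.hexdigest()

# Same bytes, different timestamps -> different checksums.
assert salted_sha256(b"same bytes", "2023-11-13T08:00-08:00") != salted_sha256(
    b"same bytes", "2023-11-13T08:01-08:00"
)
```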
diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index fd473682..f8279efb 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -431,6 +431,9 @@ def persist_content_and_get_drs_object( filepath = str(Path(save_dir).joinpath(filename)) with open(filepath, "w") as f: f.write(content) + now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat( + timespec="minutes" + ) object_in = DrsObjectIn( **drs_metadata_for( filepath, @@ -438,10 +441,11 @@ def persist_content_and_get_drs_object( "description": ( description + f" (created by/for {username}" - + f" at {datetime.now(tz=ZoneInfo('America/Los_Angeles')).isoformat(timespec='minutes')})" + + f" at {now_to_the_minute})" ), "access_methods": [{"access_id": drs_id}], }, + timestamp=now_to_the_minute, ) ) self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}" diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 23e94dd1..475a98d4 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -82,7 +82,7 @@ def put_object(filepath, url, mime_type=None): return requests.put(url, data=f, headers={"Content-Type": mime_type}) -def drs_metadata_for(filepath, base=None): +def drs_metadata_for(filepath, base=None, timestamp=None): """given file path, get drs metadata required: size, created_time, and at least one checksum. @@ -96,7 +96,7 @@ def drs_metadata_for(filepath, base=None): ) if "checksums" not in base: base["checksums"] = [ - {"type": "sha256", "checksum": sha256hash_from_file(filepath)} + {"type": "sha256", "checksum": sha256hash_from_file(filepath, timestamp)} ] if "mime_type" not in base: base["mime_type"] = mimetypes.guess_type(filepath)[0] diff --git a/tests/unit/core_util.py b/tests/unit/core_util.py new file mode 100644 index 00000000..0fbd2873 --- /dev/null +++ b/tests/unit/core_util.py @@ -0,0 +1,21 @@ +from datetime import datetime, timedelta +from pathlib import Path +from zoneinfo import ZoneInfo + +from nmdc_runtime.api.core.util import sha256hash_from_file + +TEST_FILES_DIR = Path(__file__).parent.parent.joinpath("files") + + +def test_sha256hash_from_file_is_timestamp_dependent(): + file_path = str(TEST_FILES_DIR.joinpath("test_changesheet_update_one_ph.tsv")) + ts_1 = datetime.now(tz=ZoneInfo("America/Los_Angeles")) + ts_2 = ts_1 + timedelta(minutes=1) + hashes = [] + for ts in (ts_1, ts_2): + hashes.append( + sha256hash_from_file( + file_path=file_path, timestamp=ts.isoformat(timespec="minutes") + ) + ) + assert hashes[0] != hashes[1] From 1eacc43922104be514d626ac4831114f4378d2e1 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Mon, 13 Nov 2023 13:36:17 -0500 Subject: [PATCH 28/28] fix: coerce pydantic AnyUri to str when needed (#377) fixes #376 --- nmdc_runtime/site/resources.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 5f857a77..22b22d6d 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -17,7 +17,7 @@ from fastjsonschema import JsonSchemaValueException from frozendict import frozendict from linkml_runtime.dumpers import json_dumper -from pydantic import BaseModel +from pydantic import BaseModel, AnyUrl from pymongo import MongoClient, ReplaceOne, InsertOne from terminusdb_client import WOQLClient from toolz import get_in @@ -194,15 +194,17 @@ def get_object_bytes(self, object_id) -> requests.Response: access = AccessURL( **self.get_object_access(object_id, method.access_id).json() ) - if 
access.url.startswith( + if str(access.url).startswith( os.getenv("API_HOST_EXTERNAL") ) and self.base_url == os.getenv("API_HOST"): - access.url = access.url.replace( - os.getenv("API_HOST_EXTERNAL"), os.getenv("API_HOST") + access.url = AnyUrl( + str(access.url).replace( + os.getenv("API_HOST_EXTERNAL"), os.getenv("API_HOST") + ) ) else: access = AccessURL(url=method.access_url.url) - return requests.get(access.url) + return requests.get(str(access.url)) def list_jobs(self, list_request=None): if list_request is None: