diff --git a/.github/workflows/build-and-push-docker-images.yml b/.github/workflows/build-and-push-docker-images.yml
index 32be4213..5ef44f46 100644
--- a/.github/workflows/build-and-push-docker-images.yml
+++ b/.github/workflows/build-and-push-docker-images.yml
@@ -1,8 +1,15 @@
 name: build-and-push-docker-images
 on:
+  workflow_dispatch:
   push:
     branches:
      - main
+    paths:
+      - '.github/workflows/build-and-push-docker-images.yml'
+      - 'Makefile'
+      - '**.Dockerfile'
+      - '**.py'
+      - 'requirements/main.txt'
 
 jobs:
   docker:
diff --git a/.github/workflows/release-to-pypi.yml b/.github/workflows/release-to-pypi.yml
new file mode 100644
index 00000000..3c8fec18
--- /dev/null
+++ b/.github/workflows/release-to-pypi.yml
@@ -0,0 +1,30 @@
+name: Publish Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  build-n-publish:
+    name: Build and publish Python 🐍 distributions 📦 to PyPI
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Build source and wheel archives
+        run: |
+          make init
+          python -m build
+
+      - name: Publish distribution 📦 to PyPI
+        if: github.repository == 'microbiomedata/nmdc-runtime'
+        uses: pypa/gh-action-pypi-publish@master
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_PASSWORD }}
diff --git a/README.md b/README.md
index 103df781..6fc163df 100644
--- a/README.md
+++ b/README.md
@@ -144,3 +144,12 @@ desired and does not break over time.
 [For hints on how to write tests for solids and pipelines in Dagster, see their
 documentation tutorial on Testing](https://docs.dagster.io/tutorial/testable).
+
+
+## Release to PyPI
+
+```
+rm -rf dist
+python -m build
+twine upload dist/*
+```
\ No newline at end of file
diff --git a/RELEASES.md b/RELEASES.md
index 3b0ab860..92e9b802 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -13,6 +13,10 @@
 Use to express the current date and tim
 time offset for New York on standard time (EST). "−08:00" would be for California.
## Release Log +* 2023-11-07T19:45:00-08:00 update nmdc-schema package from 9.0.4 to 9.1.0 +* 2023-11-07T19:30:00-08:00 update nmdc-schema package from 8.1.2 to 9.0.4 +* 2023-11-07T17:30:00-08:00 update nmdc-schema package from 8.0.0 to 8.1.2 +* (missing entries) * 2023-08-31T22:15:00-07:00 update nmdc-schema package from 7.7.2 to 7.8.0 * 2023-01-27T13:13:09-05:00 return 201 on activity creation * 2023-01-25T13:13:09-05:00 all typecodes for minter diff --git a/components/nmdc_runtime/workflow_execution_activity/core.py b/components/nmdc_runtime/workflow_execution_activity/core.py index 8a550e80..43236234 100644 --- a/components/nmdc_runtime/workflow_execution_activity/core.py +++ b/components/nmdc_runtime/workflow_execution_activity/core.py @@ -94,7 +94,7 @@ def insert_into_keys( workflow: Workflow, data_objects: list[DataObject] ) -> dict[str, Any]: """Insert data object url into correct workflow input field.""" - workflow_dict = workflow.dict() + workflow_dict = workflow.model_dump() for key in workflow_dict["inputs"]: for do in data_objects: if workflow_dict["inputs"][key] == str(do.data_object_type): diff --git a/demo/metadata_migration/notebooks/.mongo.yaml.example b/demo/metadata_migration/notebooks/.mongo.yaml.example index a596e1f1..a1df2e02 100644 --- a/demo/metadata_migration/notebooks/.mongo.yaml.example +++ b/demo/metadata_migration/notebooks/.mongo.yaml.example @@ -15,6 +15,14 @@ # mongodb://root:pass@localhost:27017/?authSource=admin # ``` # +# Example: +# Assuming the same scenario as in the previous example, but without +# access control enabled (i.e. no username/password), +# the value of `uri` would be: +# ``` +# mongodb://localhost:27017/ +# ``` +# # Reference: # https://www.mongodb.com/docs/database-tools/mongodump/#std-option-mongodump.--uri # diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb new file mode 100644 index 00000000..67ee6038 --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb @@ -0,0 +1,585 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Migrate Mongo data from `nmdc-schema` [`v8.0.0`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.0.0) to [`v8.1.2`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.1.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### 1. Determine Mongo collections that will be transformed\n", + "\n", + "In this step, you will determine which Mongo collections will be transformed during this migration.\n", + "\n", + "1. In [`nmdc_schema/migration_recursion.py`](https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py), locate the Python class whose name reflects the initial and final version numbers of this migration.\n", + "2. In that Python class, locate the `self.agenda` dictionary.\n", + "3. In that dictionary, make a list of the keys—these are the names of the Mongo collections that will be transformed during this migration. For example:\n", + " ```py\n", + " self.agenda = dict(\n", + " collection_name_1=[self.some_function],\n", + " collection_name_2=[self.some_function],\n", + " )\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Coordinate with teammates that read/write to those collections\n", + "\n", + "In this step, you'll identify and reach out to the people that read/write to those collections; to agree on a migration schedule that works for you and them.\n", + "\n", + "Here's a table of Mongo collections and the components of the NMDC system that write to them (according to [a conversation that occurred on September 11, 2023](https://nmdc-group.slack.com/archives/C01SVTKM8GK/p1694465755802979?thread_ts=1694216327.234519&cid=C01SVTKM8GK)).\n", + "\n", + "| Mongo collection | NMDC system components that write to it |\n", + "|---------------------------------------------|----------------------------------------------------------|\n", + "| `biosample_set` | Workflows (via manual entry via `nmdc-runtime` HTTP API) |\n", + "| `data_object_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `mags_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_annotation_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_assembly_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_based_taxonomy_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_qc_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `jobs` | Scheduler (via Mongo directly) |\n", + "| `*` | `nmdc-runtime` (via Mongo directly) |\n", + "\n", + "You can use that table to help determine which people read/write to those collections. You can then coordinate a migration time slot with them via Slack, email, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Setup a migration environment\n", + "\n", + "In this step, you'll set up an environment in which you can run this notebook.\n", + "\n", + "1. Start a **Mongo server** on your local machine (and ensure it does **not** contain a database named `nmdc`).\n", + " 1. You can start a temporary, [Docker](https://hub.docker.com/_/mongo)-based Mongo server at `localhost:27055` by running this command:\n", + " ```shell\n", + " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo\n", + " ```\n", + " > Note: A Mongo server started via that command will have no access control (i.e. you will be able to access it without a username or password).\n", + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. You can use the `.notebook.env.example` file as a template:\n", + " ```shell\n", + " $ cp .notebook.env.example .notebook.env\n", + " ```\n", + "3. Create and populate **Mongo configuration files** for connecting to the origin and transformer Mongo servers.\n", + " 1. You can use the `.mongo.yaml.example` file as a template:\n", + " ```shell\n", + " $ cp .mongo.yaml.example .mongo.origin.yaml\n", + " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", + " ```\n", + " > When populating the file for the origin Mongo server, use credentials that have write access to the `nmdc` database." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Python dependencies\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends. 
You can do that by running this cell.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook) now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==8.1.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "> Note: One of the Python objects is a Python class that is specific to this migration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library packages:\n", + "from pathlib import Path\n", + "from shutil import rmtree\n", + "from copy import deepcopy\n", + "\n", + "# Third-party packages:\n", + "import pymongo\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.migration_recursion import Migrator_from_8_0_0_to_8_1_0 as Migrator\n", + "from jsonschema import Draft7Validator\n", + "from dictdiffer import diff\n", + "\n", + "# First-party packages:\n", + "from helpers import Config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Programmatically determine which collections will be transformed\n", + "\n", + "Here are the names of the collections this migration will transform.\n", + "\n", + "> Ensure you have coordinated with the people that read/write to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agenda_collection_names = Migrator().agenda.keys()\n", + "\n", + "print(\"The following collections will be transformed:\")\n", + "print(\"\\n\".join(agenda_collection_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the application paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!{mongodump} --version\n", + "!{mongorestore} --version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Mongo clients\n", + "\n", + "Create Mongo clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mongo client for origin Mongo server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "\n", + "# Mongo client for transformer Mongo server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the Mongo clients' abilities to access their respective Mongo servers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the Mongo server version (running on the \"origin\" Mongo server).\n", + "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Sanity test: Ensure the origin database exists.\n", + "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n", + "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Sanity test: Ensure the transformation database does not exist.\n", + "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create JSON Schema validator\n", + "\n", + "In this step, you'll create a JSON Schema validator for the NMDC Schema." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", + "\n", + "> Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", + "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", + "\n", + "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump collections from the \"origin\" Mongo server\n", + "\n", + "In this step, you'll use `mongodump` to dump the collections that will be transformed during this migration; from the \"origin\" Mongo server.\n", + "\n", + "Since `mongodump` doesn't provide a CLI option that you can use to specify the collections you _want_ it to dump (unless that is only one collection), you can use a different CLI option to tell it all the collection you do _not_ want it to dump. The end result will be the same—there's just an extra step involved." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That extra step is to generate an `--excludeCollection=\"{name}\"` CLI option for each collection that is not on the agenda, which you'll do now." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", + "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", + "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n", + "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", + "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "\n", + "print(exclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongodump` command containing all those `--excludeCollection=\"{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the not-excluded collections from the origin database.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.origin_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the collections into the \"transformer\" Mongo server\n", + "\n", + "In this step, you'll load the collections dumped from the \"origin\" Mongo server, into the \"transformer\" MongoDB server.\n", + "\n", + "Since it's possible that the dump includes more collections than are on the agenda (due to someone creating a collection between the time you generated the exclusion list and the time you ran `mongodump`), you will use one or more of `mongorestore`'s `--nsInclude` CLI options to indicate which collections you want to load.\n", + "\n", + "Here's where you will generate the `--nsInclude=\"nmdc.{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n", + "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "\n", + "print(inclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongorestore` command containing all those `--nsInclude=\"nmdc.{name}\"` CLI options." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Restore the dumped collections to the transformer MongoDB server.\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform the collections within the \"transformer\" Mongo server\n", + "\n", + "Now that the transformer database contains a copy of each collection on the agenda, you can transform those copies.\n", + "\n", + "The transformation functions are provided by the `nmdc-schema` Python package.\n", + "> You can examine the transformation functions at: https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py\n", + "\n", + "In this step, you will retrieve each documents from each collection on the agenda, pass it to the associated transformation function(s) on the agenda, then store the transformed document in place of the original one—all within the \"transformation\" database only. **The \"origin\" database is not involved with this step.**\n", + "\n", + "> Note: This step also includes validation. Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py\n", + "\n", + "> Note: This step also include a before-and-after comparison to facilitate manual spot checks. References: https://docs.python.org/3/library/copy.html#copy.deepcopy and https://dictdiffer.readthedocs.io/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "migrator = Migrator()\n", + "\n", + "# Apply the transformations.\n", + "for collection_name, transformation_pipeline in migrator.agenda.items():\n", + " print(f\"Transforming documents in collection: {collection_name}\")\n", + " transformed_documents = []\n", + "\n", + " # Get each document from this collection.\n", + " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " for original_document in collection.find():\n", + " # Make a deep copy of the original document, to enable before-and-after comparison.\n", + " print(original_document)\n", + " copy_of_original_document = deepcopy(original_document)\n", + " \n", + " # Put the document through the transformation pipeline associated with this collection.\n", + " transformed_document = original_document # initializes the variable\n", + " for transformation_function in transformation_pipeline:\n", + " transformed_document = transformation_function(transformed_document)\n", + " print(transformed_document)\n", + " \n", + " # Compare the transformed document with a copy of the original document;\n", + " # and, if there are any differences, print those differences.\n", + " difference = diff(copy_of_original_document, transformed_document)\n", + " differences = list(difference)\n", + " if len(differences) > 0:\n", + " print(f\"✏️ {differences}\")\n", + "\n", + " # Validate the transformed document.\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. 
However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", + " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", + " #\n", + " transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n", + " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", + " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + "\n", + " # Store the transformed document.\n", + " transformed_documents.append(transformed_document) \n", + " print(\"\") \n", + "\n", + " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", + " for transformed_document in transformed_documents:\n", + " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump the transformed collections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the database from the transformer MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the transformed data into the \"origin\" Mongo server\n", + "\n", + "In this step, you'll put the transformed collection(s) into the origin MongoDB server, replacing the original collection(s) that have the same name(s)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", + " --drop \\\n", + " --preserveUUID \\ \n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Clean up\n", + "\n", + "Delete the temporary files and MongoDB dumps created by this notebook.\n", + "\n", + "> Note: You can skip this step, in case you want to delete them manually later (e.g. to examine them before deleting them)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "paths_to_files_to_delete = []\n", + "\n", + "paths_to_folders_to_delete = [\n", + " cfg.origin_dump_folder_path,\n", + " cfg.transformer_dump_folder_path,\n", + "]\n", + "\n", + "# Delete files.\n", + "for path in [Path(string) for string in paths_to_files_to_delete]:\n", + " try:\n", + " path.unlink()\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")\n", + "\n", + "# Delete folders.\n", + "for path in [Path(string) for string in paths_to_folders_to_delete]:\n", + " try:\n", + " rmtree(path)\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb new file mode 100644 index 00000000..b2e91df3 --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb @@ -0,0 +1,607 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Migrate Mongo data from `nmdc-schema` [`v8.1.2`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v8.1.2) to [`v9.0.4`](https://github.com/microbiomedata/nmdc-schema/releases/tag/v9.0.4)\n", + "\n", + "## Preface\n", + "\n", + "Download a copy of https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/study_dois_changes.yaml and save it at the path `assets/misc/study_dois_changes.yaml` relative to this notebook. This is a workaround for issue https://github.com/microbiomedata/nmdc-schema/issues/1310. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### 1. Determine Mongo collections that will be transformed\n", + "\n", + "In this step, you will determine which Mongo collections will be transformed during this migration.\n", + "\n", + "1. In [`nmdc_schema/migration_recursion.py`](https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py), locate the Python class whose name reflects the initial and final version numbers of this migration.\n", + "2. In that Python class, locate the `self.agenda` dictionary.\n", + "3. In that dictionary, make a list of the keys—these are the names of the Mongo collections that will be transformed during this migration. For example:\n", + " ```py\n", + " self.agenda = dict(\n", + " collection_name_1=[self.some_function],\n", + " collection_name_2=[self.some_function],\n", + " )\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Coordinate with teammates that read/write to those collections\n", + "\n", + "In this step, you'll identify and reach out to the people that read/write to those collections; to agree on a migration schedule that works for you and them.\n", + "\n", + "Here's a table of Mongo collections and the components of the NMDC system that write to them (according to [a conversation that occurred on September 11, 2023](https://nmdc-group.slack.com/archives/C01SVTKM8GK/p1694465755802979?thread_ts=1694216327.234519&cid=C01SVTKM8GK)).\n", + "\n", + "| Mongo collection | NMDC system components that write to it |\n", + "|---------------------------------------------|----------------------------------------------------------|\n", + "| `biosample_set` | Workflows (via manual entry via `nmdc-runtime` HTTP API) |\n", + "| `data_object_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `mags_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_annotation_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `metagenome_assembly_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_based_taxonomy_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `read_qc_analysis_activity_set` | Workflows (via `nmdc-runtime` HTTP API) |\n", + "| `jobs` | Scheduler (via Mongo directly) |\n", + "| `*` | `nmdc-runtime` (via Mongo directly) |\n", + "\n", + "You can use that table to help determine which people read/write to those collections. You can then coordinate a migration time slot with them via Slack, email, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Setup a migration environment\n", + "\n", + "In this step, you'll set up an environment in which you can run this notebook.\n", + "\n", + "1. Start a **Mongo server** on your local machine (and ensure it does **not** contain a database named `nmdc`).\n", + " 1. You can start a temporary, [Docker](https://hub.docker.com/_/mongo)-based Mongo server at `localhost:27055` by running this command:\n", + " ```shell\n", + " # Run in any directory:\n", + " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo\n", + " ```\n", + " > Note: A Mongo server started via that command will have no access control (i.e. you will be able to access it without a username or password).\n", + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. You can use the `.notebook.env.example` file as a template:\n", + " ```shell\n", + " # Run in the same directory as this notebook:\n", + " $ cp .notebook.env.example .notebook.env\n", + " ```\n", + "3. Create and populate **Mongo configuration files** for connecting to the origin and transformer Mongo servers.\n", + " 1. You can use the `.mongo.yaml.example` file as a template:\n", + " ```shell\n", + " # Run in the same directory as this notebook:\n", + " $ cp .mongo.yaml.example .mongo.origin.yaml\n", + " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", + " ```\n", + " > When populating the file for the origin Mongo server, use credentials that have write access to the `nmdc` database." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Python dependencies\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends. 
You can do that by running this cell.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook) now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==9.0.4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "> Note: One of the Python objects is a Python class that is specific to this migration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library packages:\n", + "from pathlib import Path\n", + "from shutil import rmtree\n", + "from copy import deepcopy\n", + "\n", + "# Third-party packages:\n", + "import pymongo\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.migration_recursion import Migrator_from_8_1_to_9_0 as Migrator\n", + "from jsonschema import Draft7Validator\n", + "from dictdiffer import diff\n", + "\n", + "# First-party packages:\n", + "from helpers import Config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Programmatically determine which collections will be transformed\n", + "\n", + "Here are the names of the collections this migration will transform.\n", + "\n", + "> Ensure you have coordinated with the people that read/write to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agenda_collection_names = Migrator().agenda.keys()\n", + "\n", + "print(\"The following collections will be transformed:\")\n", + "print(\"\\n\".join(agenda_collection_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the application paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!{mongodump} --version\n", + "!{mongorestore} --version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Mongo clients\n", + "\n", + "Create Mongo clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mongo client for origin Mongo server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "\n", + "# Mongo client for transformer Mongo server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the Mongo clients' abilities to access their respective Mongo servers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the Mongo server version (running on the \"origin\" Mongo server).\n", + "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Sanity test: Ensure the origin database exists.\n", + "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n", + "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + "# Sanity test: Ensure the transformation database does not exist.\n", + "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create JSON Schema validator\n", + "\n", + "In this step, you'll create a JSON Schema validator for the NMDC Schema." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", + "\n", + "> Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", + "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", + "\n", + "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump collections from the \"origin\" Mongo server\n", + "\n", + "In this step, you'll use `mongodump` to dump the collections that will be transformed during this migration; from the \"origin\" Mongo server.\n", + "\n", + "Since `mongodump` doesn't provide a CLI option that you can use to specify the collections you _want_ it to dump (unless that is only one collection), you can use a different CLI option to tell it all the collection you do _not_ want it to dump. The end result will be the same—there's just an extra step involved." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That extra step is to generate an `--excludeCollection=\"{name}\"` CLI option for each collection that is not on the agenda, which you'll do now." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", + "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", + "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n", + "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", + "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "\n", + "print(exclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongodump` command containing all those `--excludeCollection=\"{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the not-excluded collections from the origin database.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.origin_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the collections into the \"transformer\" Mongo server\n", + "\n", + "In this step, you'll load the collections dumped from the \"origin\" Mongo server, into the \"transformer\" MongoDB server.\n", + "\n", + "Since it's possible that the dump includes more collections than are on the agenda (due to someone creating a collection between the time you generated the exclusion list and the time you ran `mongodump`), you will use one or more of `mongorestore`'s `--nsInclude` CLI options to indicate which collections you want to load.\n", + "\n", + "Here's where you will generate the `--nsInclude=\"nmdc.{name}\"` CLI options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n", + "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "\n", + "print(inclusion_options_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, you'll run a `mongorestore` command containing all those `--nsInclude=\"nmdc.{name}\"` CLI options." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Restore the dumped collections to the transformer MongoDB server.\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform the collections within the \"transformer\" Mongo server\n", + "\n", + "Now that the transformer database contains a copy of each collection on the agenda, you can transform those copies.\n", + "\n", + "The transformation functions are provided by the `nmdc-schema` Python package.\n", + "> You can examine the transformation functions at: https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py\n", + "\n", + "In this step, you will retrieve each documents from each collection on the agenda, pass it to the associated transformation function(s) on the agenda, then store the transformed document in place of the original one—all within the \"transformation\" database only. **The \"origin\" database is not involved with this step.**\n", + "\n", + "> Note: This step also includes validation. Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py\n", + "\n", + "> Note: This step also include a before-and-after comparison to facilitate manual spot checks. References: https://docs.python.org/3/library/copy.html#copy.deepcopy and https://dictdiffer.readthedocs.io/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "migrator = Migrator()\n", + "\n", + "# Apply the transformations.\n", + "for collection_name, transformation_pipeline in migrator.agenda.items():\n", + " print(f\"Transforming documents in collection: {collection_name}\")\n", + " transformed_documents = []\n", + "\n", + " # Get each document from this collection.\n", + " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " for original_document in collection.find():\n", + " # Make a deep copy of the original document, to enable before-and-after comparison.\n", + " print(original_document)\n", + " copy_of_original_document = deepcopy(original_document)\n", + " \n", + " # Put the document through the transformation pipeline associated with this collection.\n", + " transformed_document = original_document # initializes the variable\n", + " for transformation_function in transformation_pipeline:\n", + " #\n", + " # THIS IS A WORKAROUND FOR ISSUE https://github.com/microbiomedata/nmdc-schema/issues/1311\n", + " #\n", + " # Note: Some of the transformation functions in the migration class specific to this migration\n", + " # do not return the transformed dictionary. As a result, transformation functions\n", + " # \"further down the pipeline\" do not receive a dictionary as input.\n", + " # \n", + " # The workaround I have employed here is:\n", + " # 1. Manually read the transformation functions and verify they all do, indeed,\n", + " # modify the dictionary \"in place\" (as opposed to returning a copy).\n", + " # 2. 
Once that has been verified; replace the standard notebook code\n", + " # (commented-out below) with code that ignores the return value\n", + " # of the transformation functions.\n", + " #\n", + " # transformed_document = transformation_function(transformed_document)\n", + " transformation_function(transformed_document)\n", + " print(transformed_document)\n", + " \n", + " # Compare the transformed document with a copy of the original document;\n", + " # and, if there are any differences, print those differences.\n", + " difference = diff(copy_of_original_document, transformed_document)\n", + " differences = list(difference)\n", + " if len(differences) > 0:\n", + " print(f\"✏️ {differences}\")\n", + "\n", + " # Validate the transformed document.\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", + " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", + " #\n", + " transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n", + " root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n", + " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + "\n", + " # Store the transformed document.\n", + " transformed_documents.append(transformed_document) \n", + " print(\"\") \n", + "\n", + " # Replace the original documents with the transformed versions of themselves (in the transformer database).\n", + " for transformed_document in transformed_documents:\n", + " collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dump the transformed collections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the database from the transformer MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the transformed data into the \"origin\" Mongo server\n", + "\n", + "In this step, you'll put the transformed collection(s) into the origin MongoDB server, replacing the original collection(s) that have the same name(s)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Clean up\n", + "\n", + "Delete the temporary files and MongoDB dumps created by this notebook.\n", + "\n", + "> Note: You can skip this step, in case you want to delete them manually later (e.g. to examine them before deleting them)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "paths_to_files_to_delete = []\n", + "\n", + "paths_to_folders_to_delete = [\n", + " cfg.origin_dump_folder_path,\n", + " cfg.transformer_dump_folder_path,\n", + "]\n", + "\n", + "# Delete files.\n", + "for path in [Path(string) for string in paths_to_files_to_delete]:\n", + " try:\n", + " path.unlink()\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")\n", + "\n", + "# Delete folders.\n", + "for path in [Path(string) for string in paths_to_folders_to_delete]:\n", + " try:\n", + " rmtree(path)\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/demo/metadata_migration/notebooks/requirements.txt b/demo/metadata_migration/notebooks/requirements.txt index 4944b3bd..4125ed9f 100644 --- a/demo/metadata_migration/notebooks/requirements.txt +++ b/demo/metadata_migration/notebooks/requirements.txt @@ -1,3 +1,5 @@ +dictdiffer==0.9.0 +jsonschema==4.19.2 pymongo==4.5.0 python-dotenv==1.0.0 PyYAML==6.0.1 \ No newline at end of file diff --git a/nmdc_runtime/api/core/metadata.py b/nmdc_runtime/api/core/metadata.py index 41a5725b..8f411f25 100644 --- a/nmdc_runtime/api/core/metadata.py +++ b/nmdc_runtime/api/core/metadata.py @@ -733,6 +733,11 @@ def _validate_changesheet(df_change: pd.DataFrame, mdb: MongoDatabase): for result in results_of_updates: if len(result.get("validation_errors", [])) > 0: validation_errors.append(result["validation_errors"]) + if ( + len(write_errors := result.get("update_info", {}).get("writeErrors", {})) + > 0 + ): + validation_errors.append(write_errors) if validation_errors: raise HTTPException( diff --git a/nmdc_runtime/api/core/util.py b/nmdc_runtime/api/core/util.py index 5aad716d..ad97471f 100644 --- a/nmdc_runtime/api/core/util.py +++ b/nmdc_runtime/api/core/util.py @@ -28,10 +28,13 @@ def hash_from_str(s: str, algo="sha256") -> str: return getattr(hashlib, algo)(s.encode("utf-8")).hexdigest() -def sha256hash_from_file(file_path: str): +def sha256hash_from_file(file_path: str, timestamp: str): # https://stackoverflow.com/a/55542529 h = hashlib.sha256() + timestamp_bytes = timestamp.encode("utf-8") + h.update(timestamp_bytes) + with open(file_path, 
"rb") as file: while True: # Reading is buffered, so we can read smaller chunks. @@ -98,6 +101,6 @@ def generate_secret(length=12): def json_clean(data, model, exclude_unset=False) -> dict: """Run data through a JSON serializer for a pydantic model.""" if not isinstance(data, (dict, BaseModel)): - raise TypeError("`data` must be a pydantic model or its .dict()") + raise TypeError("`data` must be a pydantic model or its .model_dump()") m = model(**data) if isinstance(data, dict) else data return json.loads(m.json(exclude_unset=exclude_unset)) diff --git a/nmdc_runtime/api/endpoints/objects.py b/nmdc_runtime/api/endpoints/objects.py index 13cfa3aa..706f9049 100644 --- a/nmdc_runtime/api/endpoints/objects.py +++ b/nmdc_runtime/api/endpoints/objects.py @@ -78,7 +78,7 @@ def create_object( """ id_supplied = supplied_object_id( - mdb, client_site, object_in.dict(exclude_unset=True) + mdb, client_site, object_in.model_dump(exclude_unset=True) ) drs_id = local_part( id_supplied if id_supplied is not None else generate_one_id(mdb, S3_ID_NS) @@ -255,7 +255,7 @@ def update_object( status_code=status.HTTP_403_FORBIDDEN, detail=f"client authorized for different site_id than {object_mgr_site}", ) - doc_object_patched = merge(doc, object_patch.dict(exclude_unset=True)) + doc_object_patched = merge(doc, object_patch.model_dump(exclude_unset=True)) mdb.operations.replace_one({"id": object_id}, doc_object_patched) return doc_object_patched diff --git a/nmdc_runtime/api/endpoints/operations.py b/nmdc_runtime/api/endpoints/operations.py index 1f1f67b3..c6bcccf2 100644 --- a/nmdc_runtime/api/endpoints/operations.py +++ b/nmdc_runtime/api/endpoints/operations.py @@ -61,12 +61,16 @@ def update_operation( detail=f"client authorized for different site_id than {site_id_op}", ) op_patch_metadata = merge( - op_patch.dict(exclude_unset=True).get("metadata", {}), + op_patch.model_dump(exclude_unset=True).get("metadata", {}), pick(["site_id", "job", "model"], doc_op.get("metadata", {})), ) doc_op_patched = merge( doc_op, - assoc(op_patch.dict(exclude_unset=True), "metadata", op_patch_metadata), + assoc( + op_patch.model_dump(exclude_unset=True), + "metadata", + op_patch_metadata, + ), ) mdb.operations.replace_one({"id": op_id}, doc_op_patched) return doc_op_patched diff --git a/nmdc_runtime/api/endpoints/queries.py b/nmdc_runtime/api/endpoints/queries.py index 4e1c49c9..3d57166a 100644 --- a/nmdc_runtime/api/endpoints/queries.py +++ b/nmdc_runtime/api/endpoints/queries.py @@ -75,9 +75,9 @@ def run_query( id=qid, saved_at=saved_at, ) - mdb.queries.insert_one(query.dict(exclude_unset=True)) + mdb.queries.insert_one(query.model_dump(exclude_unset=True)) cmd_response = _run_query(query, mdb) - return unmongo(cmd_response.dict(exclude_unset=True)) + return unmongo(cmd_response.model_dump(exclude_unset=True)) @router.get("/queries/{query_id}", response_model=Query) @@ -107,7 +107,7 @@ def rerun_query( check_can_delete(user) cmd_response = _run_query(query, mdb) - return unmongo(cmd_response.dict(exclude_unset=True)) + return unmongo(cmd_response.model_dump(exclude_unset=True)) def _run_query(query, mdb) -> CommandResponse: @@ -131,12 +131,12 @@ def _run_query(query, mdb) -> CommandResponse: detail="Failed to back up to-be-deleted documents. 
operation aborted.", ) - q_response = mdb.command(query.cmd.dict(exclude_unset=True)) + q_response = mdb.command(query.cmd.model_dump(exclude_unset=True)) cmd_response: CommandResponse = command_response_for(q_type)(**q_response) query_run = ( QueryRun(qid=query.id, ran_at=ran_at, result=cmd_response) if cmd_response.ok else QueryRun(qid=query.id, ran_at=ran_at, error=cmd_response) ) - mdb.query_runs.insert_one(query_run.dict(exclude_unset=True)) + mdb.query_runs.insert_one(query_run.model_dump(exclude_unset=True)) return cmd_response diff --git a/nmdc_runtime/api/endpoints/runs.py b/nmdc_runtime/api/endpoints/runs.py index c49b7800..7c41ad84 100644 --- a/nmdc_runtime/api/endpoints/runs.py +++ b/nmdc_runtime/api/endpoints/runs.py @@ -94,5 +94,5 @@ def post_run_event( status_code=status.HTTP_400_BAD_REQUEST, detail=f"Supplied run_event.run.id does not match run_id given in request URL.", ) - mdb.run_events.insert_one(run_event.dict()) + mdb.run_events.insert_one(run_event.model_dump()) return _get_run_summary(run_event.run.id, mdb) diff --git a/nmdc_runtime/api/endpoints/search.py b/nmdc_runtime/api/endpoints/search.py index 5fe80d2c..b48c411e 100644 --- a/nmdc_runtime/api/endpoints/search.py +++ b/nmdc_runtime/api/endpoints/search.py @@ -25,7 +25,7 @@ def data_objects( req: DataObjectListRequest = Depends(), mdb: MongoDatabase = Depends(get_mongo_db), ): - filter_ = list_request_filter_to_mongo_filter(req.dict(exclude_unset=True)) + filter_ = list_request_filter_to_mongo_filter(req.model_dump(exclude_unset=True)) max_page_size = filter_.pop("max_page_size", None) page_token = filter_.pop("page_token", None) req = ListRequest( diff --git a/nmdc_runtime/api/endpoints/sites.py b/nmdc_runtime/api/endpoints/sites.py index f63fd993..9e587d03 100644 --- a/nmdc_runtime/api/endpoints/sites.py +++ b/nmdc_runtime/api/endpoints/sites.py @@ -56,7 +56,7 @@ def create_site( status_code=status.HTTP_409_CONFLICT, detail=f"site with supplied id {site.id} already exists", ) - mdb.sites.insert_one(site.dict()) + mdb.sites.insert_one(site.model_dump()) refresh_minter_requesters_from_sites() rv = mdb.users.update_one( {"username": user.username}, @@ -165,7 +165,7 @@ def put_object_in_site( }, } ) - mdb.operations.insert_one(op.dict()) + mdb.operations.insert_one(op.model_dump()) return op diff --git a/nmdc_runtime/api/endpoints/users.py b/nmdc_runtime/api/endpoints/users.py index 601a5be8..5799ca3c 100644 --- a/nmdc_runtime/api/endpoints/users.py +++ b/nmdc_runtime/api/endpoints/users.py @@ -35,7 +35,7 @@ async def login_for_access_token( detail="Incorrect username or password", headers={"WWW-Authenticate": "Bearer"}, ) - access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.dict()) + access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.model_dump()) access_token = create_access_token( data={"sub": f"user:{user.username}"}, expires_delta=access_token_expires ) @@ -50,7 +50,7 @@ async def login_for_access_token( headers={"WWW-Authenticate": "Bearer"}, ) # TODO make below an absolute time - access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.dict()) + access_token_expires = timedelta(**ACCESS_TOKEN_EXPIRES.model_dump()) access_token = create_access_token( data={"sub": f"client:{form_data.client_id}"}, expires_delta=access_token_expires, @@ -58,7 +58,7 @@ async def login_for_access_token( return { "access_token": access_token, "token_type": "bearer", - "expires": ACCESS_TOKEN_EXPIRES.dict(), + "expires": ACCESS_TOKEN_EXPIRES.model_dump(), } @@ -84,8 +84,8 @@ def create_user( 
check_can_create_user(requester) mdb.users.insert_one( UserInDB( - **user_in.dict(), + **user_in.model_dump(), hashed_password=get_password_hash(user_in.password), - ).dict(exclude_unset=True) + ).model_dump(exclude_unset=True) ) return mdb.users.find_one({"username": user_in.username}) diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index d1c233b3..f8279efb 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -2,12 +2,14 @@ import os import re import tempfile +from datetime import datetime from functools import lru_cache from json import JSONDecodeError from pathlib import Path from time import time_ns from typing import List, Optional, Set, Tuple from urllib.parse import parse_qs, urlparse +from zoneinfo import ZoneInfo from bson import json_util from dagster import DagsterRunStatus @@ -429,13 +431,21 @@ def persist_content_and_get_drs_object( filepath = str(Path(save_dir).joinpath(filename)) with open(filepath, "w") as f: f.write(content) + now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat( + timespec="minutes" + ) object_in = DrsObjectIn( **drs_metadata_for( filepath, base={ - "description": description + f" (created by/for {username})", + "description": ( + description + + f" (created by/for {username}" + + f" at {now_to_the_minute})" + ), "access_methods": [{"access_id": drs_id}], }, + timestamp=now_to_the_minute, ) ) self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}" @@ -448,9 +458,11 @@ def _create_object( mdb: MongoDatabase, object_in: DrsObjectIn, mgr_site, drs_id, self_uri ): drs_obj = DrsObject( - **object_in.dict(exclude_unset=True), id=drs_id, self_uri=self_uri + **object_in.model_dump(exclude_unset=True), + id=drs_id, + self_uri=self_uri, ) - doc = drs_obj.dict(exclude_unset=True) + doc = drs_obj.model_dump(exclude_unset=True) doc["_mgr_site"] = mgr_site # manager site try: mdb.objects.insert_one(doc) @@ -511,16 +523,16 @@ def _claim_job(job_id: str, mdb: MongoDatabase, site: Site): "workflow": job.workflow, "config": job.config, } - ).dict(exclude_unset=True), + ).model_dump(exclude_unset=True), "site_id": site.id, "model": dotted_path_for(JobOperationMetadata), }, } ) - mdb.operations.insert_one(op.dict()) - mdb.jobs.replace_one({"id": job.id}, job.dict(exclude_unset=True)) + mdb.operations.insert_one(op.model_dump()) + mdb.jobs.replace_one({"id": job.id}, job.model_dump(exclude_unset=True)) - return op.dict(exclude_unset=True) + return op.model_dump(exclude_unset=True) @lru_cache diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 5f4d799a..007f3cc5 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -3,9 +3,11 @@ from importlib import import_module from importlib.metadata import version +import fastapi import uvicorn from fastapi import APIRouter, FastAPI from fastapi.middleware.cors import CORSMiddleware +from setuptools_scm import get_version from starlette import status from starlette.responses import RedirectResponse @@ -232,7 +234,7 @@ def ensure_initial_resources_on_boot(): collection_boot = import_module(f"nmdc_runtime.api.boot.{collection_name}") for model in collection_boot.construct(): - doc = model.dict() + doc = model.model_dump() mdb[collection_name].replace_one({"id": doc["id"]}, doc, upsert=True) username = os.getenv("API_ADMIN_USER") @@ -244,7 +246,7 @@ def ensure_initial_resources_on_boot(): username=username, hashed_password=get_password_hash(os.getenv("API_ADMIN_PASS")), site_admin=[os.getenv("API_SITE_ID")], - 
+            ).model_dump(exclude_unset=True),
             upsert=True,
         )
     mdb.users.create_index("username")
@@ -265,7 +267,7 @@ def ensure_initial_resources_on_boot():
                     ),
                 )
             ],
-        ).dict(),
+        ).model_dump(),
         upsert=True,
     )

@@ -338,9 +340,20 @@ async def root():
     )


+@api_router.get("/version")
+async def get_versions():
+    return {
+        "nmdc-runtime": get_version(),
+        "fastapi": fastapi.__version__,
+        "nmdc-schema": version("nmdc_schema"),
+    }
+
+
 app = FastAPI(
     title="NMDC Runtime API",
-    version="0.2.0",
+    # TODO this does not work: `version=get_version()`
+    # Below is hotfix for reasonable display in prod deployment.
+    version="1.0.7",
     description=(
         "The NMDC Runtime API, via on-demand functions "
         "and via schedule-based and sensor-based automation, "
diff --git a/nmdc_runtime/api/models/object.py b/nmdc_runtime/api/models/object.py
index 26af100c..17df772c 100644
--- a/nmdc_runtime/api/models/object.py
+++ b/nmdc_runtime/api/models/object.py
@@ -12,6 +12,7 @@
     BaseModel,
     AnyUrl,
     HttpUrl,
+    field_serializer,
 )
 from typing_extensions import Annotated

@@ -31,6 +32,10 @@ class AccessURL(BaseModel):
     headers: Optional[Dict[str, str]] = None
     url: AnyUrl

+    @field_serializer("url")
+    def serialize_url(self, url: AnyUrl, _info):
+        return str(url)
+

 class AccessMethod(BaseModel):
     access_id: Optional[Annotated[str, StringConstraints(min_length=1)]] = None
@@ -78,6 +83,12 @@ def no_contents_means_single_blob(cls, values):
             raise ValueError("no contents means no further nesting, so id required")
         return values

+    @field_serializer("drs_uri")
+    def serialize_url(self, drs_uri: Optional[List[AnyUrl]], _info):
+        if drs_uri is not None and len(drs_uri) > 0:
+            return [str(u) for u in drs_uri]
+        return drs_uri
+

 ContentsObject.update_forward_refs()

@@ -127,6 +138,10 @@ class DrsObject(DrsObjectIn):
     id: DrsId
     self_uri: AnyUrl

+    @field_serializer("self_uri")
+    def serialize_url(self, self_uri: AnyUrl, _info):
+        return str(self_uri)
+

 Seconds = Annotated[int, Field(strict=True, gt=0)]

@@ -135,6 +150,10 @@ class ObjectPresignedUrl(BaseModel):
     url: HttpUrl
     expires_in: Seconds = 300

+    @field_serializer("url")
+    def serialize_url(self, url: HttpUrl, _info):
+        return str(url)
+

 class DrsObjectOutBase(DrsObjectBase):
     checksums: List[Checksum]
@@ -145,6 +164,10 @@ class DrsObjectOutBase(DrsObjectBase):
     updated_time: Optional[datetime.datetime] = None
     version: Optional[str] = None

+    @field_serializer("self_uri")
+    def serialize_url(self, self_uri: AnyUrl, _info):
+        return str(self_uri)
+

 class DrsObjectBlobOut(DrsObjectOutBase):
     access_methods: List[AccessMethod]
diff --git a/nmdc_runtime/api/models/operation.py b/nmdc_runtime/api/models/operation.py
index e1819f24..035100cd 100644
--- a/nmdc_runtime/api/models/operation.py
+++ b/nmdc_runtime/api/models/operation.py
@@ -1,7 +1,7 @@
 import datetime
 from typing import Generic, TypeVar, Optional, List, Any, Union

-from pydantic import StringConstraints, BaseModel, HttpUrl
+from pydantic import StringConstraints, BaseModel, HttpUrl, field_serializer
 from nmdc_runtime.api.models.util import ResultT
 from typing_extensions import Annotated

@@ -59,3 +59,7 @@ class ObjectPutMetadata(Metadata):
     site_id: str
     url: HttpUrl
     expires_in_seconds: int
+
+    @field_serializer("url")
+    def serialize_url(self, url: HttpUrl, _info):
+        return str(url)
diff --git a/nmdc_runtime/api/models/run.py b/nmdc_runtime/api/models/run.py
index 49fa37ba..3cdf43d1 100644
--- a/nmdc_runtime/api/models/run.py
+++ b/nmdc_runtime/api/models/run.py
@@ -93,7 +93,7 @@ def _add_run_requested_event(run_spec: RunUserSpec, mdb: MongoDatabase, user: Us
         time=now(as_str=True),
         inputs=run_spec.inputs,
     )
-    mdb.run_events.insert_one(event.dict())
+    mdb.run_events.insert_one(event.model_dump())
     return run_id
@@ -113,7 +113,7 @@ def _add_run_started_event(run_id: str, mdb: MongoDatabase):
             job=requested.job,
             type=RunEventType.STARTED,
             time=now(as_str=True),
-        ).dict()
+        ).model_dump()
     )
     return run_id
@@ -134,7 +134,7 @@ def _add_run_fail_event(run_id: str, mdb: MongoDatabase):
             job=requested.job,
             type=RunEventType.FAIL,
             time=now(as_str=True),
-        ).dict()
+        ).model_dump()
     )
     return run_id
@@ -156,6 +156,6 @@ def _add_run_complete_event(run_id: str, mdb: MongoDatabase, outputs: List[str])
             type=RunEventType.COMPLETE,
             time=now(as_str=True),
             outputs=outputs,
-        ).dict()
+        ).model_dump()
     )
     return run_id
diff --git a/nmdc_runtime/core/exceptions/token.py b/nmdc_runtime/core/exceptions/token.py
index c5e9b1c3..afe00871 100644
--- a/nmdc_runtime/core/exceptions/token.py
+++ b/nmdc_runtime/core/exceptions/token.py
@@ -1,4 +1,4 @@
-from core.exceptions import CustomException
+from nmdc_runtime.core.exceptions import CustomException


 class DecodeTokenException(CustomException):
diff --git a/nmdc_runtime/minter/adapters/repository.py b/nmdc_runtime/minter/adapters/repository.py
index 25382731..96775c92 100644
--- a/nmdc_runtime/minter/adapters/repository.py
+++ b/nmdc_runtime/minter/adapters/repository.py
@@ -97,7 +97,7 @@ def mint(self, req_mint: MintingRequest) -> list[Identifier]:
             )
         )
         for id_ in ids:
-            self.db[id_.id] = id_.dict()
+            self.db[id_.id] = id_.model_dump()
         return ids

     def bind(self, req_bind: BindingRequest) -> Identifier:
@@ -184,7 +184,7 @@ def mint(self, req_mint: MintingRequest) -> list[Identifier]:
                 )
                 for id_name in not_taken
             ]
-            self.db["minter.id_records"].insert_many([i.dict() for i in ids])
+            self.db["minter.id_records"].insert_many([i.model_dump() for i in ids])
             collected.extend(ids)
             if len(collected) == req_mint.how_many:
                 break
diff --git a/nmdc_runtime/minter/entrypoints/fastapi_app.py b/nmdc_runtime/minter/entrypoints/fastapi_app.py
index d0ac097f..996cb575 100644
--- a/nmdc_runtime/minter/entrypoints/fastapi_app.py
+++ b/nmdc_runtime/minter/entrypoints/fastapi_app.py
@@ -37,7 +37,11 @@ def mint_ids(
     requester = Entity(id=site.id)
     try:
         minted = s.mint(
-            MintingRequest(service=service, requester=requester, **req_mint.dict())
+            MintingRequest(
+                service=service,
+                requester=requester,
+                **req_mint.model_dump(),
+            )
         )
         return [d.id for d in minted]
     except MinterError as e:
diff --git a/nmdc_runtime/site/drsobjects/ingest.py b/nmdc_runtime/site/drsobjects/ingest.py
index b4b7dc38..42eeb8a5 100644
--- a/nmdc_runtime/site/drsobjects/ingest.py
+++ b/nmdc_runtime/site/drsobjects/ingest.py
@@ -44,7 +44,7 @@ def claim_metadata_ingest_jobs(
     )
     jobs = []
     while True:
-        rv = client.list_jobs(lr.dict()).json()
+        rv = client.list_jobs(lr.model_dump()).json()
         jobs.extend(rv["resources"])
         if "next_page_token" not in rv:
             break
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 828fe4e8..eecd4665 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -267,7 +267,7 @@ def produce_curated_db(context, op: Operation):
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     mdb: MongoDatabase = context.resources.mongo.db
-    op = Operation[ResultT, JobOperationMetadata](**op.dict())
+    op = Operation[ResultT, JobOperationMetadata](**op.model_dump())
     op_meta: JobOperationMetadata = op.metadata
     job_id = op_meta.job.id
     job = mdb.jobs.find_one({"id": job_id})
@@ -350,7 +350,7 @@ def filter_ops_undone_expired() -> str:
 @op(required_resource_keys={"runtime_api_site_client"})
 def list_operations(context, filter_: str) -> list:
     client = context.resources.runtime_api_site_client
-    ops = [op.dict() for op in client.list_operations({"filter": filter_})]
+    ops = [op.model_dump() for op in client.list_operations({"filter": filter_})]
     context.log.info(str(len(ops)))
     return ops

@@ -466,7 +466,7 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
     op = Operation(**mdb.operations.find_one({"id": op_id}))
     op.done = True
     op.result = {"update_cmd": json.dumps(update_cmd)}
-    op_doc = op.dict(exclude_unset=True)
+    op_doc = op.model_dump(exclude_unset=True)
     mdb.operations.replace_one({"id": op_id}, op_doc)
     return ["/operations/" + op_doc["id"]]

diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py
index 44270249..bbd7ae5e 100644
--- a/nmdc_runtime/site/repository.py
+++ b/nmdc_runtime/site/repository.py
@@ -405,7 +405,7 @@ def claim_and_run_apply_changesheet_jobs(_context):
 def done_object_put_ops(_context):
     client = get_runtime_api_site_client(run_config_frozen__normal_env)
     ops = [
-        op.dict()
+        op.model_dump()
         for op in client.list_operations(
             {
                 "filter": json.dumps(
diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py
index f344d9e0..a831da33 100644
--- a/nmdc_runtime/site/resources.py
+++ b/nmdc_runtime/site/resources.py
@@ -17,7 +17,7 @@
 from fastjsonschema import JsonSchemaValueException
 from frozendict import frozendict
 from linkml_runtime.dumpers import json_dumper
-from pydantic import BaseModel
+from pydantic import BaseModel, AnyUrl
 from pymongo import MongoClient, ReplaceOne, InsertOne
 from terminusdb_client import WOQLClient
 from toolz import get_in
@@ -62,7 +62,7 @@ def request(self, method, url_path, params_or_json_data=None):
         self.ensure_token()
         kwargs = {"url": self.base_url + url_path, "headers": self.headers}
         if isinstance(params_or_json_data, BaseModel):
-            params_or_json_data = params_or_json_data.dict(exclude_unset=True)
+            params_or_json_data = params_or_json_data.model_dump(exclude_unset=True)
         if method.upper() == "GET":
             kwargs["params"] = params_or_json_data
         else:
@@ -234,15 +234,17 @@ def get_object_bytes(self, object_id) -> requests.Response:
             access = AccessURL(
                 **self.get_object_access(object_id, method.access_id).json()
             )
-            if access.url.startswith(
+            if str(access.url).startswith(
                 os.getenv("API_HOST_EXTERNAL")
             ) and self.base_url == os.getenv("API_HOST"):
-                access.url = access.url.replace(
-                    os.getenv("API_HOST_EXTERNAL"), os.getenv("API_HOST")
+                access.url = AnyUrl(
+                    str(access.url).replace(
+                        os.getenv("API_HOST_EXTERNAL"), os.getenv("API_HOST")
+                    )
                 )
         else:
             access = AccessURL(url=method.access_url.url)
-        return requests.get(access.url)
+        return requests.get(str(access.url))

     def list_jobs(self, list_request=None):
         if list_request is None:
diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 23e94dd1..475a98d4 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -82,7 +82,7 @@ def put_object(filepath, url, mime_type=None):
         return requests.put(url, data=f, headers={"Content-Type": mime_type})


-def drs_metadata_for(filepath, base=None):
+def drs_metadata_for(filepath, base=None, timestamp=None):
     """given file path, get drs metadata

     required: size, created_time, and at least one checksum.
@@ -96,7 +96,7 @@ def drs_metadata_for(filepath, base=None):
         )
     if "checksums" not in base:
         base["checksums"] = [
-            {"type": "sha256", "checksum": sha256hash_from_file(filepath)}
+            {"type": "sha256", "checksum": sha256hash_from_file(filepath, timestamp)}
         ]
     if "mime_type" not in base:
         base["mime_type"] = mimetypes.guess_type(filepath)[0]
diff --git a/requirements/dev.in b/requirements/dev.in
index 1b7450f2..dbe7b8e9 100644
--- a/requirements/dev.in
+++ b/requirements/dev.in
@@ -10,6 +10,5 @@ pytest-asyncio
 pytest-cov
 requests-mock
 setuptools
-setuptools-scm
 twine
 requests-cache
\ No newline at end of file
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 2a4d676b..b87ef35a 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -85,7 +85,6 @@ packaging==23.2
     # via
     #   -c requirements/main.txt
     #   black
     #   build
     #   pytest
-    #   setuptools-scm
 pathspec==0.11.2
     # via
     #   -c requirements/main.txt
@@ -149,19 +148,27 @@ rfc3986==2.0.0
     # via twine
 rich==13.6.0
     # via twine
-setuptools-scm==8.0.4
-    # via -r requirements/dev.in
 six==1.16.0
     # via
     #   -c requirements/main.txt
     #   requests-mock
     #   url-normalize
+tomli==2.0.1
+    # via
+    #   -c requirements/main.txt
+    #   black
+    #   build
+    #   coverage
+    #   pip-tools
+    #   pyproject-hooks
+    #   pytest
 twine==4.0.2
     # via -r requirements/dev.in
 typing-extensions==4.8.0
     # via
     #   -c requirements/main.txt
-    #   setuptools-scm
+    #   black
+    #   cattrs
 url-normalize==1.4.3
     # via
     #   -c requirements/main.txt
@@ -189,4 +196,3 @@ setuptools==68.2.2
     # via
     #   -c requirements/main.txt
     #   -r requirements/dev.in
     #   pip-tools
-    #   setuptools-scm
diff --git a/requirements/main.in b/requirements/main.in
index 1a53fd78..c5c46de0 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -24,7 +24,7 @@ mkdocs-jupyter
 mkdocs-material
 mkdocs-mermaid2-plugin
 motor
-nmdc-schema==8.0.0
+nmdc-schema==9.1.0
 openpyxl
 pandas
 passlib[bcrypt]
@@ -36,6 +36,7 @@ python-multipart
 pyyaml
 requests
 semver
+setuptools-scm
 tenacity
 terminusdb-client
 toolz
diff --git a/requirements/main.txt b/requirements/main.txt
index c1b44f0b..c9e99118 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -446,7 +446,7 @@ nbformat==5.9.2
     #   nbconvert
 nest-asyncio==1.5.8
     # via ipykernel
-nmdc-schema==8.0.0
+nmdc-schema==9.1.0
     # via -r requirements/main.in
 notebook==7.0.6
     # via jupyter
@@ -477,6 +477,7 @@ packaging==23.2
     #   pytest
     #   qtconsole
     #   qtpy
+    #   setuptools-scm
     #   sphinx
 paginate==0.5.6
     # via mkdocs-material
@@ -715,7 +716,9 @@ rpds-py==0.12.0
 rsa==4.9
     # via python-jose
 ruamel-yaml==0.18.5
-    # via linkml-dataops
+    # via
+    #   linkml-dataops
+    #   nmdc-schema
 ruamel-yaml-clib==0.2.8
     # via ruamel-yaml
 s3transfer==0.7.0
@@ -724,6 +727,8 @@ semver==3.0.2
     # via -r requirements/main.in
 send2trash==1.8.2
     # via jupyter-server
+setuptools-scm==8.0.4
+    # via -r requirements/main.in
 shed==2023.6.1
     # via terminusdb-client
 shexjsg==0.8.2
@@ -857,6 +862,7 @@ typing-extensions==4.8.0
     #   prefixmaps
     #   pydantic
     #   pydantic-core
+    #   setuptools-scm
     #   sqlalchemy
     #   typing-inspect
 typing-inspect==0.9.0
@@ -919,3 +925,4 @@ setuptools==68.2.2
     # via
     #   dagster
     #   mkdocs-mermaid2-plugin
+    #   setuptools-scm
diff --git a/tests/files/nmdc_bsm-12-7mysck21.json b/tests/files/nmdc_bsm-12-7mysck21.json
new file mode 100644
index 00000000..d0571f47
--- /dev/null
+++ b/tests/files/nmdc_bsm-12-7mysck21.json
@@ -0,0 +1,48 @@
+{
+    "analysis_type": [
+        "metagenomics"
+    ],
+    "biosample_categories": [
+        "NEON"
+    ],
+    "collection_date": {
+        "has_raw_value": "2014-07-15T18:00Z"
+    },
+    "depth": {
+        "has_maximum_numeric_value": 1,
+        "has_minimum_numeric_value": 0,
+        "has_unit": "meters"
+    },
+    "elev": 1179.5,
+    "env_broad_scale": {
+        "term": {
+            "id": "ENVO:01000253",
+            "name": "freshwater river biome"
+        }
+    },
+    "env_local_scale": {
+        "term": {
+            "id": "ENVO:03600095",
+            "name": "stream run"
+        }
+    },
+    "env_medium": {
+        "term": {
+            "id": "ENVO:01001057",
+            "name": "environment associated with a plant part or small plant"
+        }
+    },
+    "geo_loc_name": {
+        "has_raw_value": "USA: Colorado, Arikaree River"
+    },
+    "id": "nmdc:bsm-12-7mysck21",
+    "lat_lon": {
+        "latitude": 39.758206,
+        "longitude": -102.447148
+    },
+    "name": "ARIK.20140715.AMC.EPIPHYTON.5",
+    "part_of": [
+        "nmdc:sty-11-34xj1150"
+    ],
+    "type": "nmdc:Biosample"
+}
diff --git a/tests/files/nmdc_sty-11-pzmd0x14.json b/tests/files/nmdc_sty-11-pzmd0x14.json
new file mode 100644
index 00000000..114437c0
--- /dev/null
+++ b/tests/files/nmdc_sty-11-pzmd0x14.json
@@ -0,0 +1,65 @@
+{
+    "id": "nmdc:sty-11-pzmd0x14",
+    "name": "National Ecological Observatory Network: benthic metagenomes (DP1.20279.001)",
+    "type": "nmdc:Study",
+    "title": "National Ecological Observatory Network: benthic metagenomes (DP1.20279.001)",
+    "description": "The National Science Foundation's National Ecological Observatory Network (NEON) is a continental-scale observation facility operated by Battelle and designed to collect long-term open access ecological data to better understand how U.S. ecosystems are changing.",
+    "websites": [
+        "https://www.neonscience.org/",
+        "https://data.neonscience.org/data-products/DP1.20279.001",
+        "https://data.neonscience.org/api/v0/documents/NEON_metagenomes_userGuide_vE.pdf"
+    ],
+    "funding_sources": [
+        "NSF#1724433 National Ecological Observatory Network: Operations Activities"
+    ],
+    "principal_investigator": {
+        "name": "Kate Thibault",
+        "email": "kthibault@battelleecology.org",
+        "orcid": "orcid:0000-0003-3477-6424",
+        "has_raw_value": "Kate Thibault"
+    },
+    "has_credit_associations": [
+        {
+            "applies_to_person": {
+                "name": "Hugh Cross",
+                "email": "crossh@battelleecology.org",
+                "orcid": "orcid:0000-0002-6745-9479"
+            },
+            "applied_roles": [
+                "Methodology",
+                "Data curation"
+            ]
+        },
+        {
+            "applies_to_person": {
+                "name": "Kate Thibault",
+                "email": "kthibault@battelleecology.org",
+                "orcid": "orcid:0000-0003-3477-6424"
+            },
+            "applied_roles": [
+                "Principal Investigator"
+            ]
+        },
+        {
+            "applies_to_person": {
+                "name": "Stephanie Parker",
+                "email": "sparker@battelleecology.org",
+                "orcid": "0000-0002-7180-7245"
+            },
+            "applied_roles": [
+                "Methodology",
+                "Data curation"
+            ]
+        }
+    ],
+    "study_image": [
+        {
+            "url": "https://portal.nersc.gov/project/m3408/profile_images/nmdc_sty-11-34xj1150.jpg"
+        }
+    ],
+    "gold_study_identifiers": [],
+    "part_of": [
+        "nmdc:sty-11-nxrz9m96"
+    ],
+    "study_category": "consortium"
+}
diff --git a/tests/integration/test_minter_repository.py b/tests/integration/test_minter_repository.py
index 96199670..2524ade4 100644
--- a/tests/integration/test_minter_repository.py
+++ b/tests/integration/test_minter_repository.py
@@ -29,7 +29,10 @@ def test_mint_and_resolve():
     s: InMemoryIDStore = get_test_inmemoryidstore()
     req_mint = minting_request()
     id_: Identifier = next(i for i in s.mint(req_mint))
-    req_res = ResolutionRequest(id_name=id_.name, **req_mint.dict())
+    req_res = ResolutionRequest(
+        id_name=id_.name,
+        **req_mint.model_dump(),
+    )
     assert s.resolve(req_res) is not None

@@ -37,9 +40,12 @@ def test_mint_and_delete():
     s: InMemoryIDStore = get_test_inmemoryidstore()
     req_mint = minting_request()
     id_: Identifier = next(i for i in s.mint(req_mint))
-    req_del = DeleteRequest(id_name=id_.name, **req_mint.dict())
+    req_del = DeleteRequest(
+        id_name=id_.name,
+        **req_mint.model_dump(),
+    )
     s.delete(req_del)
-    assert s.resolve(ResolutionRequest(**req_del.dict())) is None
+    assert s.resolve(ResolutionRequest(**req_del.model_dump())) is None


 def test_mongo_mint_one():
@@ -70,7 +76,10 @@ def test_mongo_mint_and_resolve():
     req_mint = minting_request()
     id_: Identifier = next(i for i in s.mint(req_mint))
-    req_res = ResolutionRequest(id_name=id_.name, **req_mint.dict())
+    req_res = ResolutionRequest(
+        id_name=id_.name,
+        **req_mint.model_dump(),
+    )
     assert s.resolve(req_res) is not None

@@ -80,7 +89,10 @@ def test_mongo_mint_and_delete():
     req_mint = minting_request()
     id_: Identifier = next(i for i in s.mint(req_mint))
-    req_del = DeleteRequest(id_name=id_.name, **req_mint.dict())
+    req_del = DeleteRequest(
+        id_name=id_.name,
+        **req_mint.model_dump(),
+    )
     s.delete(req_del)
-    assert s.resolve(ResolutionRequest(**req_del.dict())) is None
+    assert s.resolve(ResolutionRequest(**req_del.model_dump())) is None
     assert s.db["minter.id_records"].count_documents({}) == 0
diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py
index 1dd677cc..68a597ff 100644
--- a/tests/test_api/test_endpoints.py
+++ b/tests/test_api/test_endpoints.py
@@ -1,4 +1,6 @@
+import json
 import os
+import re

 import pytest
 import requests
@@ -7,13 +9,17 @@
 from toolz import get_in

 from nmdc_runtime.api.core.auth import get_password_hash
+from nmdc_runtime.api.core.metadata import df_from_sheet_in, _validate_changesheet
 from nmdc_runtime.api.core.util import generate_secret, dotted_path_for
 from nmdc_runtime.api.db.mongo import get_mongo_db
+from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object
 from nmdc_runtime.api.models.job import Job, JobOperationMetadata
+from nmdc_runtime.api.models.metadata import ChangesheetIn
 from nmdc_runtime.api.models.site import SiteInDB, SiteClientInDB
 from nmdc_runtime.api.models.user import UserInDB, UserIn, User
 from nmdc_runtime.site.repository import run_config_frozen__normal_env
 from nmdc_runtime.site.resources import get_mongo, RuntimeApiSiteClient
+from nmdc_runtime.util import REPO_ROOT_DIR


 def ensure_test_resources(mdb):
@@ -26,7 +32,7 @@ def ensure_test_resources(mdb):
             username=username,
             hashed_password=get_password_hash(password),
             site_admin=[site_id],
-        ).dict(exclude_unset=True),
+        ).model_dump(exclude_unset=True),
         upsert=True,
     )

@@ -42,7 +48,7 @@ def ensure_test_resources(mdb):
                     hashed_secret=get_password_hash(client_secret),
                 )
             ],
-        ).dict(),
+        ).model_dump(),
         upsert=True,
     )
     wf_id = "test"
@@ -50,7 +56,9 @@ def ensure_test_resources(mdb):
     prev_ops = {"metadata.job.id": job_id, "metadata.site_id": site_id}
     mdb.operations.delete_many(prev_ops)
     job = Job(**{"id": job_id, "workflow": {"id": wf_id}, "config": {}, "claims": []})
-    mdb.jobs.replace_one({"id": job_id}, job.dict(exclude_unset=True), upsert=True)
+    mdb.jobs.replace_one(
+        {"id": job_id}, job.model_dump(exclude_unset=True), upsert=True
+    )
     return {
         "site_client": {
             "site_id": site_id,
@@ -58,7 +66,7 @@ def ensure_test_resources(mdb):
             "client_secret": client_secret,
         },
         "user": {"username": username, "password": password},
-        "job": job.dict(exclude_unset=True),
+        "job": job.model_dump(exclude_unset=True),
     }

@@ -114,7 +122,7 @@ def get_token():
         "POST",
         url=(base_url + "/users"),
         headers=headers,
-        json=user_in.dict(exclude_unset=True),
+        json=user_in.model_dump(exclude_unset=True),
     )

     try:
@@ -181,3 +189,41 @@ def test_metadata_validate_json_with_unknown_collection(api_site_client):
         {"studi_set": []},
     )
     assert rv.json()["result"] == "errors"
+
+
+def test_submit_changesheet():
+    sheet_in = ChangesheetIn(
+        name="sheet",
+        content_type="text/tab-separated-values",
+        text="id\taction\tattribute\tvalue\nnmdc:bsm-12-7mysck21\tupdate\tpart_of\tnmdc:sty-11-pzmd0x14\n",
+    )
+    mdb = get_mongo_db()
+    rs = ensure_test_resources(mdb)
+    if not mdb.biosample_set.find_one({"id": "nmdc:bsm-12-7mysck21"}):
+        mdb.biosample_set.insert_one(
+            json.loads(
+                (
+                    REPO_ROOT_DIR / "tests" / "files" / "nmdc_bsm-12-7mysck21.json"
+                ).read_text()
+            )
+        )
+    if not mdb.study_set.find_one({"id": "nmdc:sty-11-pzmd0x14"}):
+        mdb.study_set.insert_one(
+            json.loads(
+                (
+                    REPO_ROOT_DIR / "tests" / "files" / "nmdc_sty-11-pzmd0x14.json"
+                ).read_text()
+            )
+        )
+    df_change = df_from_sheet_in(sheet_in, mdb)
+    _ = _validate_changesheet(df_change, mdb)
+    drs_obj_doc = persist_content_and_get_drs_object(
+        content=sheet_in.text,
+        username=rs["user"]["username"],
+        filename=re.sub(r"[^A-Za-z0-9._\-]", "_", sheet_in.name),
+        content_type=sheet_in.content_type,
+        description="changesheet",
+        id_ns="changesheets",
+    )
+    mdb.objects.delete_one({"id": drs_obj_doc["id"]})
+    assert True
diff --git a/tests/test_api/test_metadata.py b/tests/test_api/test_metadata.py
index 9fc63254..82b6e70f 100644
--- a/tests/test_api/test_metadata.py
+++ b/tests/test_api/test_metadata.py
@@ -7,6 +7,8 @@
 import pytest

 from nmdc_runtime.api.db.mongo import get_mongo_db
+from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object
+from nmdc_runtime.api.models.metadata import ChangesheetIn
 from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from toolz import dissoc

@@ -15,6 +17,8 @@
     update_mongo_db,
     mongo_update_command_for,
     copy_docs_in_update_cmd,
+    df_from_sheet_in,
+    _validate_changesheet,
 )
 from nmdc_runtime.site.ops import ensure_data_object_type
 from nmdc_runtime.site.repository import run_config_frozen__normal_env
diff --git a/tests/test_graphs/ensure_jobs.py b/tests/test_graphs/ensure_jobs.py
new file mode 100644
index 00000000..5a7359dd
--- /dev/null
+++ b/tests/test_graphs/ensure_jobs.py
@@ -0,0 +1,26 @@
+import pytest
+from toolz import merge
+
+from nmdc_runtime.site.graphs import ensure_jobs
+from nmdc_runtime.site.repository import preset_normal
+
+
+@pytest.mark.skip(reason="Needs supplied state")
+def test_ensure_jobs():
+    job = ensure_jobs.to_job(name="test_ensure_jobs", **preset_normal)
+    run_config = merge({}, preset_normal["config"])
+    run_config["ops"] = {
+        "construct_jobs": {
+            "config": {
+                "base_jobs": [
+                    {
+                        "config": {"object_id": "gfs03r29"},
+                        "workflow": {"id": "apply-changesheet-1.0"},
+                    }
+                ]
+            }
+        }
+    }
+    result = job.execute_in_process(run_config=run_config)
+
+    assert result.success
diff --git a/tests/unit/core_util.py b/tests/unit/core_util.py
new file mode 100644
index 00000000..0fbd2873
--- /dev/null
+++ b/tests/unit/core_util.py
@@ -0,0 +1,21 @@
+from datetime import datetime, timedelta
+from pathlib import Path
+from zoneinfo import ZoneInfo
+
+from nmdc_runtime.api.core.util import sha256hash_from_file
+
+TEST_FILES_DIR = Path(__file__).parent.parent.joinpath("files")
+
+
+def test_sha256hash_from_file_is_timestamp_dependent():
+    file_path = str(TEST_FILES_DIR.joinpath("test_changesheet_update_one_ph.tsv"))
+    ts_1 = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
+    ts_2 = ts_1 + timedelta(minutes=1)
+    hashes = []
+    for ts in (ts_1, ts_2):
+        hashes.append(
+            sha256hash_from_file(
+                file_path=file_path, timestamp=ts.isoformat(timespec="minutes")
+            )
+        )
+    assert hashes[0] != hashes[1]
diff --git a/util/mongorestore-nmdc.sh b/util/mongorestore-nmdc.sh
index e0f5f253..aa23ccfb 100755
--- a/util/mongorestore-nmdc.sh
+++ b/util/mongorestore-nmdc.sh
@@ -4,4 +4,4 @@
 # $ ./util/mongorestore-nmdc.sh
 mongorestore -h $MONGO_HOST -u $MONGO_USERNAME -p $MONGO_PASSWORD --authenticationDatabase=admin \
   --gzip --drop \
-  $HOME/nmdcdb-mongodump/nmdcdb/2023-05-24T11/
\ No newline at end of file
+  $HOME/nmdcdb-mongodump/nmdcdb/2023-11-02T11/
\ No newline at end of file
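The recurring change in the hunks above is the Pydantic v1-to-v2 migration: `.dict()` becomes `.model_dump()`, and `AnyUrl`/`HttpUrl` fields gain a `field_serializer` so they reach PyMongo as plain strings. A minimal, self-contained sketch of that pattern follows, assuming pydantic>=2; the model name and URL below are illustrative only, not taken from the codebase.

```python
# Illustrative sketch of the Pydantic v2 pattern applied throughout this patch.
# The model is hypothetical, not an nmdc-runtime class.
from typing import Optional

from pydantic import AnyUrl, BaseModel, field_serializer


class AccessURLExample(BaseModel):
    url: AnyUrl
    headers: Optional[dict[str, str]] = None

    @field_serializer("url")
    def serialize_url(self, url: AnyUrl, _info):
        # Pydantic v2 URL types are no longer str subclasses, so cast explicitly
        # before the document is handed to e.g. mdb.objects.insert_one(doc).
        return str(url)


doc = AccessURLExample(url="https://example.org/data").model_dump(exclude_unset=True)
# Without the serializer, model_dump() would leave a pydantic Url object here.
assert isinstance(doc["url"], str)
```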