diff --git a/demo/metadata_migration/notebooks/bookkeeper.py b/demo/metadata_migration/notebooks/bookkeeper.py new file mode 100644 index 00000000..dc8e7e11 --- /dev/null +++ b/demo/metadata_migration/notebooks/bookkeeper.py @@ -0,0 +1,125 @@ +from typing import Optional +from enum import Enum +from datetime import datetime + +from pymongo import MongoClient +from nmdc_schema.migrators.migrator_base import MigratorBase + + +class MigrationEvent(str, Enum): + r""" + Enumeration of all migration events that can be recorded. + Reference: https://docs.python.org/3.10/library/enum.html#others + + >>> MigrationEvent.MIGRATION_COMPLETED.value + 'MIGRATION_COMPLETED' + >>> MigrationEvent.MIGRATION_STARTED.value + 'MIGRATION_STARTED' + """ + MIGRATION_STARTED = "MIGRATION_STARTED" + MIGRATION_COMPLETED = "MIGRATION_COMPLETED" + + +class Bookkeeper: + r""" + A class you can use to record migration-related events in a Mongo database. + """ + def __init__( + self, + mongo_client: MongoClient, + database_name: str = "nmdc", + collection_name: str = "_migration_events", + view_name: str = "_migration_latest_schema_version", + ): + self.database_name = database_name + self.collection_name = collection_name + self.view_name = view_name + + # Store references to the database and collection. + self.db = mongo_client[self.database_name] + self.collection = self.db.get_collection(self.collection_name) + + # Ensure the MongoDB view that indicates the current schema version, exists. + self.ensure_current_schema_version_view_exists() + + @staticmethod + def get_current_timestamp() -> str: + r"""Returns an ISO 8601 timestamp (string) representing the current time in UTC.""" + utc_now = datetime.utcnow() + iso_utc_now = utc_now.isoformat() + return iso_utc_now # e.g. 
"2024-02-21T04:31:03.115107" + + def record_migration_event( + self, + migrator: MigratorBase, + event: MigrationEvent, + to_schema_version: Optional[str] = None, + ) -> None: + r""" + Records a migration event in the collection. + + The `to_schema_version` parameter is independent of the `migrator` parameter because, even though the migrator + does have a `.get_destination_version()` method, the string returned by that method is the one defined when the + migrator was _written_, which is sometimes more generic than the one used for data validation when the migrator + is _run_ (e.g. "1.2" as opposed to "1.2.3"). So, this method provides a means by which the calling code can, + optionally, specify a more precise version identifier. + """ + + # If a custom schema version identifier was specified, use that; otherwise, use the one built into the migrator. + to_schema_version_str = migrator.get_destination_version() + if to_schema_version is not None: + to_schema_version_str = to_schema_version + + document = dict( + created_at=self.get_current_timestamp(), + event=event.value, + from_schema_version=migrator.get_origin_version(), + to_schema_version=to_schema_version_str, + migrator_module=migrator.__module__, # name of the Python module in which the `Migrator` class is defined + ) + self.collection.insert_one(document) + + def ensure_current_schema_version_view_exists(self) -> None: + r""" + Ensures the MongoDB view that indicates the current schema version, exists. 
+ + References: + - https://www.mongodb.com/community/forums/t/is-there-anyway-to-create-view-using-python/161363/2 + - https://www.mongodb.com/docs/manual/reference/method/db.createView/#mongodb-method-db.createView + - https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database.create_collection + """ + if ( + self.view_name not in self.db.list_collection_names() + ): # returns list of names of both collections and views + agg_pipeline = [ + # Sort the documents so the most recent one is first. + {"$sort": {"created_at": -1}}, + # Only preserve the first document. + {"$limit": 1}, + # Derive a simplified document for the view. + # Examples: `{ "schema_version": "1.2.3" }` or `{ "schema_version": null }` + { + "$project": { + "_id": 0, # omit the `_id` field + "schema_version": { # add this field based upon the migration status + "$cond": { + "if": { + "$eq": [ + "$event", + MigrationEvent.MIGRATION_COMPLETED.value, + ] + }, + "then": "$to_schema_version", # database conforms to this version of the NMDC Schema + "else": None, # database doesn't necessarily conform to any version of the NMDC Schema + } + }, + } + }, + ] + self.db.create_collection( + name=self.view_name, + viewOn=self.collection_name, + pipeline=agg_pipeline, + check_exists=True, # raise an error if a same-named view already exists (the `if` above makes that unexpected) + comment="The version of the NMDC Schema to which the database conforms.", + ) diff --git a/demo/metadata_migration/notebooks/manual_test_bookkeeper.py b/demo/metadata_migration/notebooks/manual_test_bookkeeper.py new file mode 100644 index 00000000..d315223e --- /dev/null +++ b/demo/metadata_migration/notebooks/manual_test_bookkeeper.py @@ -0,0 +1,186 @@ +# Note: This module's name starts with `manual_test_` instead of `test_` to signify that its author (currently) expects +# people to run it manually as opposed to via automated test infrastructure. 
Its author chose that route after a +# GitHub Actions workflow that runs tests in `test_` modules kept failing due to a database access issue. + +from typing import Optional +import unittest +import re +import os +from datetime import datetime, timedelta + +from pymongo import MongoClient, timeout +from pymongo.database import Database +from nmdc_schema.migrators.migrator_base import MigratorBase + +from demo.metadata_migration.notebooks.bookkeeper import Bookkeeper, MigrationEvent + +# Consume environment variables. +MONGO_HOST: str = os.getenv("MONGO_HOST", "localhost") +MONGO_USER: Optional[str] = os.getenv("MONGO_USERNAME", None) +MONGO_PASS: Optional[str] = os.getenv("MONGO_PASSWORD", None) +MONGO_DATABASE_NAME: str = os.getenv("MONGO_TEST_DBNAME", "test-migration-bookkeeper") + +MONGO_TIMEOUT_DURATION: int = 3 # seconds + + +class FakeMigrator(MigratorBase): + _to_version = "A.B.C" + _from_version = "X.Y.Z" + + def upgrade(self): + pass + + +class TestBookkeeper(unittest.TestCase): + r""" + Tests targeting the `Bookkeeper` class. + + You can format this file like this: + $ python -m black demo/metadata_migration/notebooks/manual_test_bookkeeper.py + + You can start up a containerized MongoDB server like this: + $ docker run --rm --detach --name mongo-test-migration-bookkeeper -p 27017:27017 mongo + + Once that's running, other containers will be able to access it via: + - host.docker.internal:27017 + + You can run these tests like this: + $ python -m unittest -v demo/metadata_migration/notebooks/manual_test_bookkeeper.py + + Reference: https://docs.python.org/3/library/unittest.html#basic-example + """ + + mongo_client: Optional[MongoClient] = None + db: Optional[Database] = None + + def setUp(self) -> None: + r""" + Connects to the MongoDB server and gets a reference to the database. + + Note: This function runs before each test starts. + """ + + # Connect to the MongoDB server and store a reference to the connection. 
+ self.mongo_client = MongoClient( + host=MONGO_HOST, + username=MONGO_USER, + password=MONGO_PASS, + ) + with timeout(MONGO_TIMEOUT_DURATION): + # Try connecting to the database server. + _ = self.mongo_client.server_info() + db = self.mongo_client[MONGO_DATABASE_NAME] + + # Ensure the database contains no collections. + if len(db.list_collection_names()): + raise KeyError(f"Database is not empty: {MONGO_DATABASE_NAME}") + + # Store a reference to the database. + self.db = db + + def tearDown(self) -> None: + r""" + Drops all collections in the database and closes the connection to the MongoDB server. + + Note: This function runs after each test finishes. + """ + + # Drop all collections in the database. + for collection_name in self.db.list_collection_names(): + self.db.drop_collection(collection_name) + + # Close the connection to the server. + self.mongo_client.close() + + def test_get_current_timestamp(self): + pattern = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}$" + ts_str = Bookkeeper.get_current_timestamp() + + # Verify the timestamp is a string having a valid format. + self.assertIsInstance(ts_str, str) + self.assertTrue(re.match(pattern, ts_str)) + + # Verify the moment represented by the timestamp was within the past minute. + ts = datetime.fromisoformat(ts_str) + now_ts = datetime.now() + time_difference = now_ts - ts + self.assertLess(time_difference, timedelta(minutes=1)) + + def test_init_method(self): + # Confirm the view does not exist yet. + view_name = "test_view" + self.assertFalse(view_name in self.db.list_collection_names()) + + # Instantiate the class-under-test. + _ = Bookkeeper( + mongo_client=self.mongo_client, + database_name=MONGO_DATABASE_NAME, + view_name=view_name, + ) + + # Confirm the view exists now. + self.assertTrue(view_name in self.db.list_collection_names()) + + def test_record_migration_event(self): + # Instantiate a bookkeeper. 
+ bk = Bookkeeper( + mongo_client=self.mongo_client, + database_name=MONGO_DATABASE_NAME, + ) + + # Verify the collection is empty. + collection = self.db.get_collection(bk.collection_name) + self.assertTrue(collection.count_documents({}) == 0) + + # Record a "migration started" event. + migrator = FakeMigrator() + bk.record_migration_event( + migrator=migrator, event=MigrationEvent.MIGRATION_STARTED + ) + + # Verify the migration event was recorded. + self.assertTrue(collection.count_documents({}) == 1) + doc = collection.find({})[0] + self.assertIsInstance(doc["created_at"], str) + self.assertIsInstance(doc["event"], str) + self.assertEqual(doc["event"], MigrationEvent.MIGRATION_STARTED) + self.assertIsInstance(doc["from_schema_version"], str) + self.assertEqual(doc["from_schema_version"], migrator.get_origin_version()) + self.assertIsInstance(doc["to_schema_version"], str) + self.assertEqual(doc["to_schema_version"], migrator.get_destination_version()) + self.assertIsInstance(doc["migrator_module"], str) + + # Verify the document in the view says the schema version is `null`. + # Note: That's what I expect, since no "migration complete" events have been recorded yet. + view = self.db.get_collection(bk.view_name) + self.assertTrue(view.count_documents({}) == 1) + view_doc = view.find({})[0] + self.assertIsNone(view_doc["schema_version"]) + + # Record a "migration completed" event. + bk.record_migration_event( + migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED + ) + + # Verify the migration event was recorded. + self.assertTrue(collection.count_documents({}) == 2) + + # Verify the document in the view says the schema version matches the one recorded. + view = self.db.get_collection(bk.view_name) + self.assertTrue(view.count_documents({}) == 1) + view_doc = view.find({})[0] + self.assertEqual(view_doc["schema_version"], migrator.get_destination_version()) + + # Finally, record another "migration started" event. 
+ bk.record_migration_event( + migrator=migrator, event=MigrationEvent.MIGRATION_STARTED + ) + + # Confirm the document in the view once again says the schema version is `null`. + self.assertTrue(view.count_documents({}) == 1) + view_doc = view.find({})[0] + self.assertIsNone(view_doc["schema_version"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb b/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb new file mode 100644 index 00000000..b8b29f38 --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb @@ -0,0 +1,545 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "source": [ + "# Migrate MongoDB database from `nmdc-schema` `vA.B.C` to `vX.Y.Z`\n", + "\n", + "- TODO: Disable read/write access to the origin database during the migration process." + ] + }, + { + "cell_type": "markdown", + "id": "f65ad4ab", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "markdown", + "id": "37d358ba", + "metadata": {}, + "source": [ + "### 1. Determine MongoDB collections involved.\n", + "\n", + "Here, you'll determine which MongoDB collections will be used as part of this migration.\n", + "\n", + "1. In the [`nmdc-schema` repo](https://github.com/microbiomedata/nmdc-schema/tree/main/nmdc_schema/migrators), go to the `nmdc_schema/migrators` directory and open the Python module whose name contains the two schema versions involved with this migration. For example, if migrating from schema version `A.B.C` to `X.Y.Z`, open the module named `migrator_from_A_B_C_to_X_Y_Z.py`.\n", + "2. Determine the collections that are accessed—whether for reading or for writing—by that module. **This is currently a manual process.**\n", + "3. Add their names to the `COLLECTION_NAMES` Python list below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09966b0d", + "metadata": {}, + "outputs": [], + "source": [ + "COLLECTION_NAMES: list[str] = [\n", + " \n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "17f351e8", + "metadata": {}, + "source": [ + "### 2. Coordinate with stakeholders.\n", + "\n", + "Identify the people that read/write to those collections, or that maintain software that reads/writes to those collection. You can view a list of stakeholders in `./stakeholders.md`. \n", + "\n", + "Once you have identified those people; coordinate with them to agree on a time window for the migration. You can contact them via Slack, for example." + ] + }, + { + "cell_type": "markdown", + "id": "233a35c3", + "metadata": {}, + "source": [ + "### 3. Set up environment.\n", + "\n", + "Here, you'll prepare an environment for running this notebook.\n", + "\n", + "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database named `nmdc`).\n", + " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command (this MongoDB server will be accessible without a username or password).\n", + " ```shell\n", + " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4\n", + " ```\n", + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. You can use `.notebook.env.example` as a template:\n", + " ```shell\n", + " $ cp .notebook.env.example .notebook.env\n", + " ```\n", + "3. Create and populate the two **MongoDB configuration files** that this notebook will use to connect to the \"origin\" and \"transformer\" MongoDB servers. The \"origin\" MongoDB server is the one that contains the database you want to migrate; and the \"transformer\" MongoDB server is the one you want to use to perform the data transformations. 
In practice, the \"origin\" MongoDB server is typically a remote server, and the \"transformer\" MongoDB server is typically a local server.\n", + " 1. You can use `.mongo.yaml.example` as a template:\n", + " ```shell\n", + " $ cp .mongo.yaml.example .mongo.origin.yaml\n", + " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", + " ```\n", + " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database." + ] + }, + { + "cell_type": "markdown", + "id": "69937b18", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "fe81196a", + "metadata": {}, + "source": [ + "### Install Python dependencies\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook) now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e25a0af308c3185b", + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==X.Y.Z" + ] + }, + { + "cell_type": "markdown", + "id": "a407c354", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "> Note: One of the `import` statements is specific to this migration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dbecd561", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-05T00:46:18.764498Z", + "start_time": "2024-03-05T00:46:18.202997Z" + } + }, + "outputs": [], + "source": [ + "# Third-party packages:\n", + "import pymongo\n", + "from jsonschema import Draft7Validator\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", + "\n", + "# TODO: Update this statement for each migration.\n", + "from nmdc_schema.migrators.migrator_from_A_B_C_to_X_Y_Z import Migrator\n", + "\n", + "# First-party packages:\n", + "from helpers import Config\n", + "from bookkeeper import Bookkeeper, MigrationEvent" + ] + }, + { + "cell_type": "markdown", + "id": "99b20ff4", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1eac645a", + "metadata": {}, + "outputs": [], + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path\n", + "\n", + "# Perform a sanity test of the application paths.\n", + "!{mongodump} --version\n", + "!{mongorestore} --version" + ] + }, + { + "cell_type": "markdown", + "id": "68245d2b", + "metadata": {}, + "source": [ + "### Create MongoDB clients\n", + "\n", + "Create MongoDB clients you can use to access the \"origin\" and \"transformer\" MongoDB servers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e95f559", + "metadata": {}, + "outputs": [], + "source": [ + "# Mongo client for \"origin\" MongoDB server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "\n", + "# Mongo client for \"transformer\" MongoDB server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", + "\n", + "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", + "with pymongo.timeout(3):\n", + " # Display the MongoDB server version (running on the \"origin\" Mongo server).\n", + " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the origin database exists.\n", + " assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", + " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the transformation database does not exist.\n", + " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create a bookkeeper\n", + "\n", + "Create a `Bookkeeper` that can be used to document migration events in the \"origin\" server." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "bc387abc62686091" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" + ], + "metadata": { + "collapsed": false + }, + "id": "5c982eb0c04e606d" + }, + { + "cell_type": "markdown", + "id": "3975ac24", + "metadata": {}, + "source": [ + "### Create JSON Schema validator\n", + "\n", + "In this step, you'll create a JSON Schema validator for the NMDC Schema." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e2dbb92", + "metadata": {}, + "outputs": [], + "source": [ + "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)\n", + "\n", + "# Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", + "# Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema\n", + "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", + "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", + "\n", + "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + ] + }, + { + "cell_type": "markdown", + "id": "fd4994a0", + "metadata": {}, + "source": [ + "### Dump collections from the \"origin\" MongoDB server\n", + "\n", + "Use `mongodump` to dump the collections involved in this migration **from** the \"origin\" MongoDB server **into** a local directory.\n", + "\n", + "> Since `mongodump` doesn't provide a CLI option we can use to specify the collections we _want_ the dump to include, we use multiple occurrences of the `--excludeCollection` CLI option to exclude each collection we do _not_ want the dump to include. The end result is the same—there's just that extra step involved." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8fa1ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", + "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", + "non_agenda_collection_names = [name for name in all_collection_names if name not in COLLECTION_NAMES]\n", + "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", + "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "print(exclusion_options_str)\n", + "\n", + "# Dump the not-excluded collections from the \"origin\" database.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.origin_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "id": "c3e3c9c4", + "metadata": {}, + "source": [ + "### Load the dumped collections into the \"transformer\" MongoDB server\n", + "\n", + "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server.\n", + "\n", + "> Since it's possible that the dump included extra collections (due to someone having created a collection between the time you generated the `--excludeCollection` CLI options and the time you ran `mongodump` above), we will use the `--nsInclude` CLI option to indicate which specific collections—from the dump—we want to load into the \"transformer\" database." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "418571c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--nsInclude=\"...\"` options, which can be included in a `mongorestore` command.\n", + "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in COLLECTION_NAMES]\n", + "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "print(inclusion_options_str)\n", + "\n", + "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "id": "4c090068", + "metadata": {}, + "source": [ + "### Transform the collections within the \"transformer\" MongoDB server\n", + "\n", + "Use the migrator to transform the collections in the \"transformer\" database.\n", + "\n", + "> Reminder: The database transformation functions are defined in the `nmdc-schema` Python package installed earlier.\n", + "\n", + "> Reminder: The \"origin\" database is **not** affected by this step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05869340", + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate a MongoAdapter bound to the \"transformer\" database.\n", + "adapter = MongoAdapter(\n", + " database=transformer_mongo_client[\"nmdc\"],\n", + " on_collection_created=lambda name: print(f'Created collection \"{name}\"'),\n", + " on_collection_renamed=lambda old_name, name: print(f'Renamed collection \"{old_name}\" to \"{name}\"'),\n", + " on_collection_deleted=lambda name: print(f'Deleted collection \"{name}\"'),\n", + ")\n", + "\n", + "# Instantiate a Migrator bound to that adapter.\n", + "migrator = Migrator(adapter=adapter)\n", + "\n", + "# Execute the Migrator's `upgrade` method to perform the migration.\n", + "migrator.upgrade()" + ] + }, + { + "cell_type": "markdown", + "id": "3edf77c7", + "metadata": {}, + "source": [ + "### Validate the transformed documents\n", + "\n", + "Now that we have transformed the database, validate each document in each collection in the \"transformer\" MongoDB server.\n", + "\n", + "> Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db6e432d", + "metadata": {}, + "outputs": [], + "source": [ + "for collection_name in COLLECTION_NAMES:\n", + " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " for document in collection.find():\n", + " # Validate the transformed document.\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. 
a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", + " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", + " #\n", + " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", + " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", + " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Indicate that the migration is underway\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration has started." + ], + "metadata": { + "collapsed": false + }, + "id": "997fcb281d9d3222" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_STARTED)" + ], + "metadata": { + "collapsed": false + }, + "id": "fcafd862e1becb98" + }, + { + "cell_type": "markdown", + "id": "1e0c8891", + "metadata": {}, + "source": [ + "### Dump the collections from the \"transformer\" MongoDB server\n", + "\n", + "Now that the collections have been transformed and validated, dump them **from** the \"transformer\" MongoDB server **into** a local directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca49f61a", + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the database from the \"transformer\" MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "id": "d84bdc11", + "metadata": {}, + "source": [ + "### Load the collections into the \"origin\" MongoDB server\n", + "\n", + "Load the transformed collections into the \"origin\" MongoDB server, **replacing** the collections there that have the same names.\n", + "\n", + "> Note: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dfbcf0a", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Indicate that the migration is complete\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration is complete." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "ca5ee89a79148499" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED)" + ], + "metadata": { + "collapsed": false + }, + "id": "d1eaa6c92789c4f3" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demo/metadata_migration/notebooks/stakeholders.md b/demo/metadata_migration/notebooks/stakeholders.md new file mode 100644 index 00000000..07f24460 --- /dev/null +++ b/demo/metadata_migration/notebooks/stakeholders.md @@ -0,0 +1,31 @@ +# Stakeholders + +## Introduction + +This document contains information about "who or what uses which collections" in the Mongo database. + +This information can be used when, for example, +you're preparing to do something that involves a specific collection (e.g. migrating its data to a new schema version), +and you want to ensure that the people that use that collection (i.e. the stakeholders) are aware of your plans. + +## Table + +Here's a table of (a) names of Mongo collections and (b) the NMDC system components that write to them +(as of +**September 11, 2023**, +according to a +[Slack conversation](https://nmdc-group.slack.com/archives/C01SVTKM8GK/p1694465755802979?thread_ts=1694216327.234519&cid=C01SVTKM8GK) +that took place that day). 
+ +| Mongo collection | System component | +|---------------------------------------------|-----------------------------------------------------------------------------| +| `biosample_set` | Workflows<br/>(via "manual entry" via Runtime API) | +| `data_object_set` | Workflows<br/>(via Runtime API) | +| `mags_activity_set` | Workflows<br/>(via Runtime API) | +| `metagenome_annotation_activity_set` | Workflows<br/>(via Runtime API) | +| `metagenome_assembly_set` | Workflows<br/>(via Runtime API) | +| `read_based_taxonomy_analysis_activity_set` | Workflows<br/>(via Runtime API) | +| `read_qc_analysis_activity_set` | Workflows<br/>(via Runtime API) | +| `jobs` | Scheduler<br/>(via MongoDB directly; e.g. `pymongo`) | +| `*` | `nmdc-runtime`; i.e. Runtime API<br/>(via MongoDB directly; e.g. `pymongo`) | + diff --git a/demo/metadata_migration/notebooks/test_helpers.py b/demo/metadata_migration/notebooks/test_helpers.py new file mode 100644 index 00000000..761a6f08 --- /dev/null +++ b/demo/metadata_migration/notebooks/test_helpers.py @@ -0,0 +1,78 @@ +import unittest +from tempfile import NamedTemporaryFile as TempFile, mkdtemp +import shutil + +from demo.metadata_migration.notebooks.helpers import Config + + +class TestConfig(unittest.TestCase): + r""" + Tests targeting the `Config` class. + + You can format this file like this: + $ python -m black demo/metadata_migration/notebooks/test_helpers.py + + You can run these tests like this: + $ python -m unittest -v demo/metadata_migration/notebooks/test_helpers.py + + Reference: https://docs.python.org/3/library/unittest.html#basic-example + """ + + def test_init_method(self): + with TempFile() as notebook_config_file, TempFile() as origin_mongo_config_file, TempFile() as transformer_mongo_config_file, TempFile() as mongodump_binary, TempFile() as mongorestore_binary: + # Create named temporary directories and get their paths. 
+ origin_dump_folder_path = mkdtemp() + transformer_dump_folder_path = mkdtemp() + + # Populate the Mongo config files, then reset their file pointers. + origin_mongo_server_uri = f"mongodb://u:p@origin:12345" + transformer_mongo_server_uri = f"mongodb://u:p@transformer:12345" + origin_mongo_yaml = f"uri: {origin_mongo_server_uri}\n" + transformer_mongo_yaml = f"uri: {transformer_mongo_server_uri}\n" + origin_mongo_config_file.write(origin_mongo_yaml.encode("utf-8")) + transformer_mongo_config_file.write(transformer_mongo_yaml.encode("utf-8")) + origin_mongo_config_file.seek(0) + transformer_mongo_config_file.seek(0) + + # Use familiar aliases in an attempt to facilitate writing the `assert` section below. + mongodump_path = mongodump_binary.name + mongorestore_path = mongorestore_binary.name + origin_mongo_config_file_path = origin_mongo_config_file.name + transformer_mongo_config_file_path = transformer_mongo_config_file.name + + # Populate the notebook config file, then reset its file pointer. + notebook_config_values = dict( + PATH_TO_ORIGIN_MONGO_CONFIG_FILE=origin_mongo_config_file_path, + PATH_TO_TRANSFORMER_MONGO_CONFIG_FILE=transformer_mongo_config_file_path, + PATH_TO_ORIGIN_MONGO_DUMP_FOLDER=origin_dump_folder_path, + PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER=transformer_dump_folder_path, + PATH_TO_MONGODUMP_BINARY=mongodump_path, + PATH_TO_MONGORESTORE_BINARY=mongorestore_path, + ) + for key, value in notebook_config_values.items(): + notebook_config_file.write(f"{key} = {value}\n".encode("utf-8")) + notebook_config_file.seek(0) + + # Instantiate the class-under-test. + cfg = Config(notebook_config_file.name) + + # Validate the instance. 
+ assert cfg.mongodump_path == mongodump_path + assert cfg.mongorestore_path == mongorestore_path + assert cfg.origin_dump_folder_path == origin_dump_folder_path + assert cfg.transformer_dump_folder_path == transformer_dump_folder_path + assert cfg.origin_mongo_config_file_path == origin_mongo_config_file_path + assert ( + cfg.transformer_mongo_config_file_path + == transformer_mongo_config_file_path + ) + assert cfg.origin_mongo_server_uri == origin_mongo_server_uri + assert cfg.transformer_mongo_server_uri == transformer_mongo_server_uri + + # Delete the temporary directories (i.e. clean up). + shutil.rmtree(origin_dump_folder_path) + shutil.rmtree(transformer_dump_folder_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/nmdc_runtime/api/core/metadata.py b/nmdc_runtime/api/core/metadata.py index dc23f88d..f2504bd8 100644 --- a/nmdc_runtime/api/core/metadata.py +++ b/nmdc_runtime/api/core/metadata.py @@ -664,9 +664,9 @@ def copy_docs_in_update_cmd( dissoc(d, "_id") for d in mdb_from[collection_name].find({"id": {"$in": ids}}) ] - results[ - collection_name - ] = f"{len(mdb_to[collection_name].insert_many(docs).inserted_ids)} docs inserted" + results[collection_name] = ( + f"{len(mdb_to[collection_name].insert_many(docs).inserted_ids)} docs inserted" + ) return results diff --git a/nmdc_runtime/api/v1/models/workflow_execution_activity.py b/nmdc_runtime/api/v1/models/workflow_execution_activity.py index 62a62f99..91cd3265 100644 --- a/nmdc_runtime/api/v1/models/workflow_execution_activity.py +++ b/nmdc_runtime/api/v1/models/workflow_execution_activity.py @@ -1,4 +1,5 @@ """Beans.""" + from typing import List from nmdc_runtime.workflow_execution_activity import ( diff --git a/nmdc_runtime/api/v1/users.py b/nmdc_runtime/api/v1/users.py index 2ec5f976..45b38d08 100644 --- a/nmdc_runtime/api/v1/users.py +++ b/nmdc_runtime/api/v1/users.py @@ -1,4 +1,5 @@ """Endpoints module.""" + from typing import List, Optional from fastapi import APIRouter, 
HTTPException, Depends, Response, status diff --git a/nmdc_runtime/api/v1/workflows/activities.py b/nmdc_runtime/api/v1/workflows/activities.py index c9971c74..1112948d 100644 --- a/nmdc_runtime/api/v1/workflows/activities.py +++ b/nmdc_runtime/api/v1/workflows/activities.py @@ -1,4 +1,5 @@ """Module.""" + import os from typing import Any diff --git a/nmdc_runtime/api/v1/workflows/activities/router.py b/nmdc_runtime/api/v1/workflows/activities/router.py index 9b5a2de6..66fed736 100644 --- a/nmdc_runtime/api/v1/workflows/activities/router.py +++ b/nmdc_runtime/api/v1/workflows/activities/router.py @@ -1,4 +1,5 @@ """Under embargo due to E999 SyntaxError""" + # """Module""" # from fastapi import APIRouter, Depends, HTTPException # from nmdc_runtime.api.models.site import Site, get_current_client_site diff --git a/nmdc_runtime/infrastructure/database/impl/mongo/db.py b/nmdc_runtime/infrastructure/database/impl/mongo/db.py index 6b3f6850..d629e99b 100644 --- a/nmdc_runtime/infrastructure/database/impl/mongo/db.py +++ b/nmdc_runtime/infrastructure/database/impl/mongo/db.py @@ -1,6 +1,7 @@ """ Database initialization """ + import os from beanie import init_beanie diff --git a/nmdc_runtime/infrastructure/database/impl/mongo/models/user.py b/nmdc_runtime/infrastructure/database/impl/mongo/models/user.py index 586d4362..c941b118 100644 --- a/nmdc_runtime/infrastructure/database/impl/mongo/models/user.py +++ b/nmdc_runtime/infrastructure/database/impl/mongo/models/user.py @@ -1,6 +1,7 @@ """ User models """ + from typing import Optional, List from beanie import Document, Indexed diff --git a/nmdc_runtime/site/export/study_metadata.py b/nmdc_runtime/site/export/study_metadata.py index ea9840b6..cdcfef8e 100644 --- a/nmdc_runtime/site/export/study_metadata.py +++ b/nmdc_runtime/site/export/study_metadata.py @@ -1,6 +1,7 @@ """ Get NMDC study-associated metadata from search api """ + import csv from io import StringIO diff --git a/nmdc_runtime/site/translation/emsl.py 
b/nmdc_runtime/site/translation/emsl.py index cbb64c74..b2d1491c 100644 --- a/nmdc_runtime/site/translation/emsl.py +++ b/nmdc_runtime/site/translation/emsl.py @@ -1,6 +1,7 @@ """ Translates EMSL data into JSON conformant with the NMDC JSON schema """ + from dagster import op, graph from nmdc_runtime.lib.nmdc_etl_class import NMDC_ETL diff --git a/nmdc_runtime/site/translation/jgi.py b/nmdc_runtime/site/translation/jgi.py index 88980934..29dac7b9 100644 --- a/nmdc_runtime/site/translation/jgi.py +++ b/nmdc_runtime/site/translation/jgi.py @@ -1,6 +1,7 @@ """ Translates EMSL data into JSON conformant with the NMDC JSON schema """ + from dagster import op, graph from nmdc_runtime.lib.nmdc_etl_class import NMDC_ETL diff --git a/nmdc_runtime/site/translation/neon_soil_translator.py b/nmdc_runtime/site/translation/neon_soil_translator.py index b08059dc..d371d1c7 100644 --- a/nmdc_runtime/site/translation/neon_soil_translator.py +++ b/nmdc_runtime/site/translation/neon_soil_translator.py @@ -160,11 +160,11 @@ def _translate_biosample( biosample_row["CNratio"].values[0], None ), ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]), - water_content=[ - f"{biosample_row['soilMoisture'].values[0]} g of water/g of dry soil" - ] - if not biosample_row["soilMoisture"].isna().any() - else None, + water_content=( + [f"{biosample_row['soilMoisture'].values[0]} g of water/g of dry soil"] + if not biosample_row["soilMoisture"].isna().any() + else None + ), ammonium_nitrogen=_create_quantity_value( biosample_row["kclAmmoniumNConc"].values[0], "mg/L" ), diff --git a/nmdc_runtime/site/validation/emsl.py b/nmdc_runtime/site/validation/emsl.py index 314352d2..0970460d 100644 --- a/nmdc_runtime/site/validation/emsl.py +++ b/nmdc_runtime/site/validation/emsl.py @@ -1,6 +1,7 @@ """ Validates data in the EMSL collection in the nmdc_etl_staging database. 
""" + from dagster import op, graph from nmdc_runtime.site.validation.util import ( preset_prod, diff --git a/nmdc_runtime/site/validation/gold.py b/nmdc_runtime/site/validation/gold.py index 29b437d6..8ceca2db 100644 --- a/nmdc_runtime/site/validation/gold.py +++ b/nmdc_runtime/site/validation/gold.py @@ -1,6 +1,7 @@ """ Validates data in the GOLD collection in the nmdc_etl_staging database. """ + from dagster import op, graph from nmdc_runtime.site.validation.util import ( preset_prod, diff --git a/nmdc_runtime/site/validation/jgi.py b/nmdc_runtime/site/validation/jgi.py index 12315695..76ff431f 100644 --- a/nmdc_runtime/site/validation/jgi.py +++ b/nmdc_runtime/site/validation/jgi.py @@ -1,6 +1,7 @@ """ Validates data in the JGI collection in the nmdc_etl_staging database. """ + from dagster import op, graph from nmdc_runtime.site.ops import local_file_to_api_object