From a04a949216c0cc07f1149e7664785bc7a8ed14f8 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 5 Dec 2024 23:22:59 -0800 Subject: [PATCH 1/3] Move migration-related files into new top-level `db/migrations/` folder --- .../migrations}/notebooks/.gitignore | 0 .../migrations}/notebooks/.notebook.env.example | 0 .../migrations}/notebooks/bookkeeper.py | 0 .../migrations}/notebooks/helpers.py | 0 .../migrations}/notebooks/manual_test_bookkeeper.py | 6 +++--- .../migrations}/notebooks/migrate_10_0_0_to_10_1_4.ipynb | 0 .../migrations}/notebooks/migrate_10_3_0_to_10_4_0.ipynb | 0 .../migrations}/notebooks/migrate_10_4_0_to_10_9_1.ipynb | 0 .../migrations}/notebooks/migrate_10_9_1_to_11_0_0.ipynb | 0 .../migrations}/notebooks/migrate_11_0_0_to_11_0_1.ipynb | 0 .../migrations}/notebooks/migrate_11_0_3_to_11_1_0.ipynb | 0 .../migrations}/notebooks/migrate_7_7_2_to_7_8_0.ipynb | 0 .../migrations}/notebooks/migrate_7_8_0_to_8_0_0.ipynb | 0 .../migrations}/notebooks/migrate_8_0_0_to_8_1_2.ipynb | 0 .../migrations}/notebooks/migrate_8_1_2_to_9_0_4.ipynb | 0 .../migrations}/notebooks/migrate_9_0_4_to_9_1_0.ipynb | 0 .../migrations}/notebooks/migrate_9_1_0_to_9_2_0.ipynb | 0 .../migrations}/notebooks/migrate_9_3_2_to_10_0_0.ipynb | 0 .../migrations}/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb | 0 .../notebooks/mongosh-scripts/restore-privileges.mongo.js | 0 .../notebooks/mongosh-scripts/revoke-privileges.mongo.js | 0 .../migrations}/notebooks/requirements.txt | 0 .../migrations}/notebooks/stakeholders.md | 0 .../migrations}/notebooks/test_helpers.py | 6 +++--- 24 files changed, 6 insertions(+), 6 deletions(-) rename {demo/metadata_migration => db/migrations}/notebooks/.gitignore (100%) rename {demo/metadata_migration => db/migrations}/notebooks/.notebook.env.example (100%) rename {demo/metadata_migration => db/migrations}/notebooks/bookkeeper.py (100%) rename {demo/metadata_migration => db/migrations}/notebooks/helpers.py (100%) rename {demo/metadata_migration => 
db/migrations}/notebooks/manual_test_bookkeeper.py (96%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_10_0_0_to_10_1_4.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_10_3_0_to_10_4_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_10_4_0_to_10_9_1.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_10_9_1_to_11_0_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_11_0_0_to_11_0_1.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_11_0_3_to_11_1_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_7_7_2_to_7_8_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_7_8_0_to_8_0_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_8_0_0_to_8_1_2.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_8_1_2_to_9_0_4.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_9_0_4_to_9_1_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_9_1_0_to_9_2_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_9_3_2_to_10_0_0.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb (100%) rename {demo/metadata_migration => db/migrations}/notebooks/mongosh-scripts/restore-privileges.mongo.js (100%) rename {demo/metadata_migration => db/migrations}/notebooks/mongosh-scripts/revoke-privileges.mongo.js (100%) rename {demo/metadata_migration => db/migrations}/notebooks/requirements.txt (100%) rename {demo/metadata_migration => db/migrations}/notebooks/stakeholders.md (100%) rename {demo/metadata_migration => db/migrations}/notebooks/test_helpers.py (96%) diff --git a/demo/metadata_migration/notebooks/.gitignore b/db/migrations/notebooks/.gitignore similarity index 
100% rename from demo/metadata_migration/notebooks/.gitignore rename to db/migrations/notebooks/.gitignore diff --git a/demo/metadata_migration/notebooks/.notebook.env.example b/db/migrations/notebooks/.notebook.env.example similarity index 100% rename from demo/metadata_migration/notebooks/.notebook.env.example rename to db/migrations/notebooks/.notebook.env.example diff --git a/demo/metadata_migration/notebooks/bookkeeper.py b/db/migrations/notebooks/bookkeeper.py similarity index 100% rename from demo/metadata_migration/notebooks/bookkeeper.py rename to db/migrations/notebooks/bookkeeper.py diff --git a/demo/metadata_migration/notebooks/helpers.py b/db/migrations/notebooks/helpers.py similarity index 100% rename from demo/metadata_migration/notebooks/helpers.py rename to db/migrations/notebooks/helpers.py diff --git a/demo/metadata_migration/notebooks/manual_test_bookkeeper.py b/db/migrations/notebooks/manual_test_bookkeeper.py similarity index 96% rename from demo/metadata_migration/notebooks/manual_test_bookkeeper.py rename to db/migrations/notebooks/manual_test_bookkeeper.py index d315223e..74745cc9 100644 --- a/demo/metadata_migration/notebooks/manual_test_bookkeeper.py +++ b/db/migrations/notebooks/manual_test_bookkeeper.py @@ -12,7 +12,7 @@ from pymongo.database import Database from nmdc_schema.migrators.migrator_base import MigratorBase -from demo.metadata_migration.notebooks.bookkeeper import Bookkeeper, MigrationEvent +from db.migrations.notebooks.bookkeeper import Bookkeeper, MigrationEvent # Consume environment variables. MONGO_HOST: str = os.getenv("MONGO_HOST", "localhost") @@ -36,7 +36,7 @@ class TestBookkeeper(unittest.TestCase): Tests targeting the `Bookkeeper` class. 
You can format this file like this: - $ python -m black demo/metadata_migration/notebooks/manual_test_bookkeeper.py + $ python -m black db/migrations/notebooks/manual_test_bookkeeper.py You can start up a containerized MongoDB server like this: $ docker run --rm --detach --name mongo-test-migration-bookkeeper -p 27017:27017 mongo @@ -45,7 +45,7 @@ class TestBookkeeper(unittest.TestCase): - host.docker.internal:27017 You can run these tests like this: - $ python -m unittest -v demo/metadata_migration/notebooks/manual_test_bookkeeper.py + $ python -m unittest -v db/migrations/notebooks/manual_test_bookkeeper.py Reference: https://docs.python.org/3/library/unittest.html#basic-example """ diff --git a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb b/db/migrations/notebooks/migrate_10_0_0_to_10_1_4.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb rename to db/migrations/notebooks/migrate_10_0_0_to_10_1_4.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_10_3_0_to_10_4_0.ipynb b/db/migrations/notebooks/migrate_10_3_0_to_10_4_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_10_3_0_to_10_4_0.ipynb rename to db/migrations/notebooks/migrate_10_3_0_to_10_4_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_10_4_0_to_10_9_1.ipynb b/db/migrations/notebooks/migrate_10_4_0_to_10_9_1.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_10_4_0_to_10_9_1.ipynb rename to db/migrations/notebooks/migrate_10_4_0_to_10_9_1.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb b/db/migrations/notebooks/migrate_10_9_1_to_11_0_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb rename to db/migrations/notebooks/migrate_10_9_1_to_11_0_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_11_0_0_to_11_0_1.ipynb 
b/db/migrations/notebooks/migrate_11_0_0_to_11_0_1.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_11_0_0_to_11_0_1.ipynb rename to db/migrations/notebooks/migrate_11_0_0_to_11_0_1.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_11_0_3_to_11_1_0.ipynb b/db/migrations/notebooks/migrate_11_0_3_to_11_1_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_11_0_3_to_11_1_0.ipynb rename to db/migrations/notebooks/migrate_11_0_3_to_11_1_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_7_7_2_to_7_8_0.ipynb b/db/migrations/notebooks/migrate_7_7_2_to_7_8_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_7_7_2_to_7_8_0.ipynb rename to db/migrations/notebooks/migrate_7_7_2_to_7_8_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_7_8_0_to_8_0_0.ipynb b/db/migrations/notebooks/migrate_7_8_0_to_8_0_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_7_8_0_to_8_0_0.ipynb rename to db/migrations/notebooks/migrate_7_8_0_to_8_0_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb b/db/migrations/notebooks/migrate_8_0_0_to_8_1_2.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_8_0_0_to_8_1_2.ipynb rename to db/migrations/notebooks/migrate_8_0_0_to_8_1_2.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb b/db/migrations/notebooks/migrate_8_1_2_to_9_0_4.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_8_1_2_to_9_0_4.ipynb rename to db/migrations/notebooks/migrate_8_1_2_to_9_0_4.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_9_0_4_to_9_1_0.ipynb b/db/migrations/notebooks/migrate_9_0_4_to_9_1_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_9_0_4_to_9_1_0.ipynb rename to db/migrations/notebooks/migrate_9_0_4_to_9_1_0.ipynb diff --git 
a/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb b/db/migrations/notebooks/migrate_9_1_0_to_9_2_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb rename to db/migrations/notebooks/migrate_9_1_0_to_9_2_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_9_3_2_to_10_0_0.ipynb b/db/migrations/notebooks/migrate_9_3_2_to_10_0_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_9_3_2_to_10_0_0.ipynb rename to db/migrations/notebooks/migrate_9_3_2_to_10_0_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb b/db/migrations/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb rename to db/migrations/notebooks/migrate_A_B_C_to_X_Y_Z.ipynb diff --git a/demo/metadata_migration/notebooks/mongosh-scripts/restore-privileges.mongo.js b/db/migrations/notebooks/mongosh-scripts/restore-privileges.mongo.js similarity index 100% rename from demo/metadata_migration/notebooks/mongosh-scripts/restore-privileges.mongo.js rename to db/migrations/notebooks/mongosh-scripts/restore-privileges.mongo.js diff --git a/demo/metadata_migration/notebooks/mongosh-scripts/revoke-privileges.mongo.js b/db/migrations/notebooks/mongosh-scripts/revoke-privileges.mongo.js similarity index 100% rename from demo/metadata_migration/notebooks/mongosh-scripts/revoke-privileges.mongo.js rename to db/migrations/notebooks/mongosh-scripts/revoke-privileges.mongo.js diff --git a/demo/metadata_migration/notebooks/requirements.txt b/db/migrations/notebooks/requirements.txt similarity index 100% rename from demo/metadata_migration/notebooks/requirements.txt rename to db/migrations/notebooks/requirements.txt diff --git a/demo/metadata_migration/notebooks/stakeholders.md b/db/migrations/notebooks/stakeholders.md similarity index 100% rename from demo/metadata_migration/notebooks/stakeholders.md 
rename to db/migrations/notebooks/stakeholders.md diff --git a/demo/metadata_migration/notebooks/test_helpers.py b/db/migrations/notebooks/test_helpers.py similarity index 96% rename from demo/metadata_migration/notebooks/test_helpers.py rename to db/migrations/notebooks/test_helpers.py index 848be095..6434e73f 100644 --- a/demo/metadata_migration/notebooks/test_helpers.py +++ b/db/migrations/notebooks/test_helpers.py @@ -2,7 +2,7 @@ from tempfile import NamedTemporaryFile as TempFile, mkdtemp import shutil -from demo.metadata_migration.notebooks.helpers import Config +from db.migrations.notebooks.helpers import Config class TestConfig(unittest.TestCase): @@ -10,10 +10,10 @@ class TestConfig(unittest.TestCase): Tests targeting the `Config` class. You can format this file like this: - $ python -m black demo/metadata_migration/notebooks/test_helpers.py + $ python -m black db/migrations/notebooks/test_helpers.py You can run these tests like this: - $ python -m unittest -v demo/metadata_migration/notebooks/test_helpers.py + $ python -m unittest -v db/migrations/notebooks/test_helpers.py Reference: https://docs.python.org/3/library/unittest.html#basic-example """ From 9c78f1fb6a545322d802c5a867af003d2144ebae Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 5 Dec 2024 23:27:36 -0800 Subject: [PATCH 2/3] Add an overview `README.md` file in each new subdirectory --- db/README.md | 7 +++++++ db/migrations/README.md | 3 +++ 2 files changed, 10 insertions(+) create mode 100644 db/README.md create mode 100644 db/migrations/README.md diff --git a/db/README.md b/db/README.md new file mode 100644 index 00000000..83195e1c --- /dev/null +++ b/db/README.md @@ -0,0 +1,7 @@ +# Database + +This directory contains files related to the MongoDB database managed by the Runtime. 
+ +It has the following subdirectories: + +- `./migrations`: files related to migrating the MongoDB database diff --git a/db/migrations/README.md b/db/migrations/README.md new file mode 100644 index 00000000..5cf247a3 --- /dev/null +++ b/db/migrations/README.md @@ -0,0 +1,3 @@ +# Migrations + +This directory contains files related to migrating the MongoDB database between schemas. From 40008e0b7d485a2ff21696abfb4fc9845d12542c Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 5 Dec 2024 23:37:57 -0800 Subject: [PATCH 3/3] Implement migration notebook for updating from schema v11.1.0 to v11.2.0 --- .../notebooks/migrate_11_1_0_to_11_2_0.ipynb | 824 ++++++++++++++++++ 1 file changed, 824 insertions(+) create mode 100644 db/migrations/notebooks/migrate_11_1_0_to_11_2_0.ipynb diff --git a/db/migrations/notebooks/migrate_11_1_0_to_11_2_0.ipynb b/db/migrations/notebooks/migrate_11_1_0_to_11_2_0.ipynb new file mode 100644 index 00000000..bb747b0e --- /dev/null +++ b/db/migrations/notebooks/migrate_11_1_0_to_11_2_0.ipynb @@ -0,0 +1,824 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "initial_id", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "source": "# Migrate MongoDB database from `nmdc-schema` `v11.1.0` to `v11.2.0`" + }, + { + "cell_type": "markdown", + "id": "3c31d85d", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "This notebook will be used to migrate the database from `nmdc-schema` `v11.1.0` ([released](https://github.com/microbiomedata/nmdc-schema/releases/tag/v11.1.0) November 12, 2024) to `v11.2.0` (whose target release date is December 6, 2024).\n", + "\n", + "### Heads up\n", + "\n", + "Unlike some previous migrators, this one does not \"pick and choose\" which collections it will dump. 
There are two reasons for this: (1) migrators no longer have a dedicated `self.agenda` dictionary that indicates all the collections involved in the migration; and (2) migrators can now create, rename, and drop collections; none of which are things that the old `self.agenda`-based system was designed to handle. So, instead of picking and choosing collections, this migrator **dumps them all.**" + ] + }, + { + "cell_type": "markdown", + "id": "f65ad4ab", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "markdown", + "id": "17f351e8", + "metadata": {}, + "source": [ + "### 1. Coordinate with stakeholders.\n", + "\n", + "We will be enacting full Runtime and Database downtime for this migration. Ensure stakeholders are aware of that." + ] + }, + { + "cell_type": "markdown", + "id": "233a35c3", + "metadata": {}, + "source": [ + "### 2. Set up notebook environment.\n", + "\n", + "Here, you'll prepare an environment for running this notebook.\n", + "\n", + "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database having the name specified in the notebook configuration file).\n", + " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running the following command. A MongoDB server started this way will be accessible without a username or password.\n" + ] + }, + { + "cell_type": "code", + "id": "8aee55e3", + "metadata": {}, + "source": [ + "!docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "6cd05ccb", + "metadata": {}, + "source": [ + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " > You can use `.notebook.env.example` as a template." 
+ ] + }, + { + "cell_type": "markdown", + "id": "69937b18", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "fe81196a", + "metadata": {}, + "source": [ + "### Install Python packages\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook cells), then proceed to the next cell.\n", + "\n", + "##### References\n", + "\n", + "| Description | Link |\n", + "|---------------------------------------------------------------------------------|--------------------------------------------------------|\n", + "| NMDC Schema PyPI package | https://pypi.org/project/nmdc-schema |\n", + "| How to `pip install` from a Git branch<br>instead of PyPI | https://stackoverflow.com/a/20101940 |" + ] + }, + { + "cell_type": "code", + "id": "e25a0af308c3185b", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "scrolled": true + }, + "source": [ + "%pip install --upgrade pip\n", + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==11.2.0" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "a407c354", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "#### References\n", + "\n", + "| Description | Link |\n", + "|----------------------------------------|-------------------------------------------------------------------------------------------------------|\n", + "| Dynamically importing a Python module | [`importlib.import_module`](https://docs.python.org/3/library/importlib.html#importlib.import_module) |\n", + "| Confirming something is a Python class | [`inspect.isclass`](https://docs.python.org/3/library/inspect.html#inspect.isclass) |" + ] + }, + { + "cell_type": "code", + "id": "9e8a3ceb", + "metadata": {}, + "source": "MIGRATOR_MODULE_NAME = \"migrator_from_11_1_0_to_11_2_0\"", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "dbecd561", + "metadata": {}, + "source": [ + "# Standard library packages:\n", + "import subprocess\n", + "from typing import List\n", + "import importlib\n", + "from inspect import isclass\n", + "\n", + "# Third-party packages:\n", + "import pymongo\n", + "from linkml.validator import Validator, ValidationReport\n", + "from linkml.validator.plugins import JsonschemaValidationPlugin\n", + "from nmdc_schema.nmdc_data import get_nmdc_schema_definition\n", + "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", + "from linkml_runtime import SchemaView\n", + "\n", + "# First-party packages:\n", + "from helpers import Config, 
setup_logger, get_collection_names_from_schema, derive_schema_class_name_from_document\n", + "from bookkeeper import Bookkeeper, MigrationEvent\n", + "\n", + "# Dynamic imports:\n", + "migrator_module = importlib.import_module(f\".{MIGRATOR_MODULE_NAME}\", package=\"nmdc_schema.migrators\")\n", + "Migrator = getattr(migrator_module, \"Migrator\") # gets the class\n", + "assert isclass(Migrator), \"Failed to import Migrator class.\"" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "99b20ff4", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "id": "1eac645a", + "metadata": {}, + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path\n", + "mongosh = cfg.mongosh_path\n", + "\n", + "# Make the base CLI options for Mongo shell commands.\n", + "origin_mongo_cli_base_options = Config.make_mongo_cli_base_options(\n", + " mongo_host=cfg.origin_mongo_host,\n", + " mongo_port=cfg.origin_mongo_port,\n", + " mongo_username=cfg.origin_mongo_username,\n", + " mongo_password=cfg.origin_mongo_password,\n", + ")\n", + "transformer_mongo_cli_base_options = Config.make_mongo_cli_base_options(\n", + " mongo_host=cfg.transformer_mongo_host,\n", + " mongo_port=cfg.transformer_mongo_port,\n", + " mongo_username=cfg.transformer_mongo_username,\n", + " mongo_password=cfg.transformer_mongo_password,\n", + ")\n", + "\n", + "# Perform a sanity test of the application paths.\n", + "!{mongodump} --version\n", + "!{mongorestore} --version\n", + "!{mongosh} --version" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "68245d2b", + "metadata": {}, + "source": [ + "### Create MongoDB clients\n", + "\n", + "Create MongoDB clients you 
can use to access the \"origin\" and \"transformer\" MongoDB servers." + ] + }, + { + "cell_type": "code", + "id": "8e95f559", + "metadata": {}, + "source": [ + "# Mongo client for \"origin\" MongoDB server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_host, \n", + " port=int(cfg.origin_mongo_port),\n", + " username=cfg.origin_mongo_username,\n", + " password=cfg.origin_mongo_password,\n", + " directConnection=True)\n", + "\n", + "# Mongo client for \"transformer\" MongoDB server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_host, \n", + " port=int(cfg.transformer_mongo_port),\n", + " username=cfg.transformer_mongo_username,\n", + " password=cfg.transformer_mongo_password,\n", + " directConnection=True)\n", + "\n", + "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", + "with pymongo.timeout(3):\n", + " # Display the MongoDB server version (running on the \"origin\" Mongo server).\n", + " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the origin database exists.\n", + " assert cfg.origin_mongo_database_name in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", + " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the transformation database does not exist.\n", + " assert cfg.transformer_mongo_database_name not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "1e195db1", + "metadata": {}, + "source": [ + "Delete the transformer database from the transformer MongoDB server if that database already exists there (e.g. 
if it was left over from an experiment).\n", + "\n", + "#### References\n", + "\n", + "| Description | Link |\n", + "|------------------------------|---------------------------------------------------------------|\n", + "| Python's `subprocess` module | https://docs.python.org/3/library/subprocess.html |\n", + "| `mongosh` CLI options | https://www.mongodb.com/docs/mongodb-shell/reference/options/ |" + ] + }, + { + "cell_type": "code", + "id": "8939a2ed", + "metadata": {}, + "source": [ + "# Note: I run this command via Python's `subprocess` module instead of via an IPython magic `!` command\n", + "# because I expect to eventually use regular Python scripts—not Python notebooks—for migrations.\n", + "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} {transformer_mongo_cli_base_options} \\\n", + " --eval 'use {cfg.transformer_mongo_database_name}' \\\n", + " --eval 'db.dropDatabase()' \\\n", + " --quiet\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "bc387abc62686091", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "### Create validator\n", + "\n", + "In this step, you'll create a validator that can be used to check whether data conforms to the NMDC Schema. 
You'll use it later to validate the transformed documents.\n", + "\n", + "#### References\n", + "\n", + "| Description | Link |\n", + "|------------------------------|------------------------------------------------------------------------------|\n", + "| LinkML's `Validator` class | https://linkml.io/linkml/code/validator.html#linkml.validator.Validator |\n", + "| Validating data using LinkML | https://linkml.io/linkml/data/validating-data.html#validation-in-python-code |" + ] + }, + { + "cell_type": "code", + "id": "5c982eb0c04e606d", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "schema_definition = get_nmdc_schema_definition()\n", + "validator = Validator(\n", + " schema=schema_definition,\n", + " validation_plugins=[JsonschemaValidationPlugin(closed=True)],\n", + ")\n", + "\n", + "# Perform a sanity test of the validator.\n", + "assert callable(validator.validate), \"Failed to instantiate a validator\"" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "e7e8befb362a1670", + "metadata": {}, + "source": [ + "### Create SchemaView\n", + "\n", + "In this step, you'll instantiate a `SchemaView` that is bound to the destination schema. 
\n", + "\n", + "#### References\n", + "\n", + "| Description | Link |\n", + "|-----------------------------|-----------------------------------------------------|\n", + "| LinkML's `SchemaView` class | https://linkml.io/linkml/developers/schemaview.html |" + ] + }, + { + "cell_type": "code", + "id": "625a6e7df5016677", + "metadata": {}, + "source": [ + "schema_view = SchemaView(get_nmdc_schema_definition())\n", + "\n", + "# As a sanity test, confirm we can use the `SchemaView` instance to access a schema class.\n", + "schema_view.get_class(class_name=\"Database\")[\"name\"]" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "3975ac24", + "metadata": {}, + "source": [ + "### Revoke access from the \"origin\" MongoDB server\n", + "\n", + "We revoke both \"write\" and \"read\" access to the server.\n", + "\n", + "#### Rationale\n", + "\n", + "We revoke \"write\" access so people don't make changes to the original data while the migration is happening, given that the migration ends with an overwriting of the original data (which would wipe out any changes made in the meantime).\n", + "\n", + "We also revoke \"read\" access. 
The revocation of \"read\" access is technically optional, but (a) the JavaScript mongosh script will be easier for me to maintain if it revokes everything and (b) this prevents people from reading data during the restore step, during which the database may not be self-consistent.\n", + "\n", + "#### References\n", + "\n", + "| Description | Link |\n", + "|--------------------------------|-----------------------------------------------------------|\n", + "| Running a script via `mongosh` | https://www.mongodb.com/docs/mongodb-shell/write-scripts/ |" + ] + }, + { + "cell_type": "code", + "id": "f761caad", + "metadata": {}, + "source": [ + "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} {origin_mongo_cli_base_options} \\\n", + " --file='mongosh-scripts/revoke-privileges.mongo.js' \\\n", + " --quiet\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "7f9c87de6fb8530c", + "metadata": {}, + "source": [ + "### Delete obsolete dumps from previous migrations\n", + "\n", + "Delete any existing dumps before we create new ones in this notebook. This is so the dumps you generate with this notebook do not get merged with any unrelated ones." + ] + }, + { + "cell_type": "code", + "id": "6a949d0fcb4b6fa0", + "metadata": {}, + "source": [ + "!rm -rf {cfg.origin_dump_folder_path}\n", + "!rm -rf {cfg.transformer_dump_folder_path}" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "b7799910b6b0715d", + "metadata": {}, + "source": [ + "### Dump collections from the \"origin\" MongoDB server\n", + "\n", + "Use `mongodump` to dump all the collections **from** the \"origin\" MongoDB server **into** a local directory.\n", + "\n", + "- TODO: Consider only dumping collections represented by the initial schema." 
+ ] + }, + { + "cell_type": "code", + "id": "da530d6754c4f6fe", + "metadata": { + "scrolled": true + }, + "source": [ + "# Dump all collections from the \"origin\" database.\n", + "shell_command = f\"\"\"\n", + " {mongodump} {origin_mongo_cli_base_options} \\\n", + " --db='{cfg.origin_mongo_database_name}' \\\n", + " --out='{cfg.origin_dump_folder_path}' \\\n", + " --gzip\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "932ebde8abdd70ec", + "metadata": {}, + "source": [ + "### Load the dumped collections into the \"transformer\" MongoDB server\n", + "\n", + "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server.\n", + "\n", + "References:\n", + "- https://www.mongodb.com/docs/database-tools/mongorestore/#std-option-mongorestore\n", + "- https://www.mongodb.com/docs/database-tools/mongorestore/mongorestore-examples/#copy-clone-a-database" + ] + }, + { + "cell_type": "code", + "id": "79bd888e82d52a93", + "metadata": { + "scrolled": true + }, + "source": [ + "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", + "shell_command = f\"\"\"\n", + " {mongorestore} {transformer_mongo_cli_base_options} \\\n", + " --nsFrom='{cfg.origin_mongo_database_name}.*' \\\n", + " --nsTo='{cfg.transformer_mongo_database_name}.*' \\\n", + " --dir='{cfg.origin_dump_folder_path}' \\\n", + " --stopOnError \\\n", + " --drop \\\n", + " --gzip\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "c3e3c9c4", + "metadata": {}, + "source": [ + "### Transform the collections within the \"transformer\" MongoDB server\n", + "\n", + 
"Use the migrator to transform the collections in the \"transformer\" database.\n", + "\n", + "> Reminder: The database transformation functions are defined in the `nmdc-schema` Python package installed earlier.\n", + "\n", + "> Reminder: The \"origin\" database is **not** affected by this step." + ] + }, + { + "cell_type": "code", + "id": "9c89c9dd3afe64e2", + "metadata": { + "scrolled": true + }, + "source": [ + "# Instantiate a MongoAdapter bound to the \"transformer\" database.\n", + "adapter = MongoAdapter(\n", + " database=transformer_mongo_client[cfg.transformer_mongo_database_name],\n", + " on_collection_created=lambda name: print(f'Created collection \"{name}\"'),\n", + " on_collection_renamed=lambda old_name, name: print(f'Renamed collection \"{old_name}\" to \"{name}\"'),\n", + " on_collection_deleted=lambda name: print(f'Deleted collection \"{name}\"'),\n", + ")\n", + "\n", + "# Instantiate a Migrator bound to that adapter.\n", + "logger = setup_logger()\n", + "migrator = Migrator(adapter=adapter, logger=logger)\n", + "\n", + "# Execute the Migrator's `upgrade` method to perform the migration.\n", + "migrator.upgrade()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "4c090068", + "metadata": {}, + "source": [ + "### Validate the transformed documents\n", + "\n", + "Now that we have transformed the database, validate each document in each collection in the \"transformer\" MongoDB server." + ] + }, + { + "cell_type": "code", + "id": "e1c50b9911e02e70", + "metadata": {}, + "source": [ + "# Get the names of all collections.\n", + "collection_names: List[str] = get_collection_names_from_schema(schema_view)\n", + "\n", + "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `collection_names`,\n", + "# it goes at the end of the list we process. 
That way, we can find out about validation errors in\n", + "# other collections without having to wait for that (large) collection to be validated.\n", + "ordered_collection_names = sorted(collection_names.copy())\n", + "large_collection_name = \"functional_annotation_agg\"\n", + "if large_collection_name in ordered_collection_names:\n", + " ordered_collection_names = list(filter(lambda n: n != large_collection_name, ordered_collection_names))\n", + " ordered_collection_names.append(large_collection_name) # puts it last\n", + "\n", + "# Process each collection.\n", + "for collection_name in ordered_collection_names:\n", + " collection = transformer_mongo_client[cfg.transformer_mongo_database_name][collection_name]\n", + " num_documents_in_collection = collection.count_documents({})\n", + " print(f\"Validating collection {collection_name} ({num_documents_in_collection} documents) [\", end=\"\") # no newline\n", + "\n", + " # Calculate how often we'll display a tick mark (i.e. a sign of life).\n", + " num_documents_per_tick = num_documents_in_collection * 0.10 # one tenth of the total\n", + " num_documents_since_last_tick = 0\n", + " \n", + " for document in collection.find():\n", + " # Validate the transformed document.\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: The reason we don't use a progress bar library such as `rich[jupyter]`, `tqdm`, or `ipywidgets`\n", + " # is that _PyCharm's_ Jupyter Notebook integration doesn't fully work with any of them. 
:(\n", + " #\n", + " schema_class_name = derive_schema_class_name_from_document(schema_view=schema_view, document=document)\n", + " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", + " validation_report: ValidationReport = validator.validate(document_without_underscore_id_key, schema_class_name)\n", + " if len(validation_report.results) > 0:\n", + " result_messages = [result.message for result in validation_report.results]\n", + " raise TypeError(f\"Document is invalid.\\n{result_messages=}\\n{document_without_underscore_id_key=}\")\n", + "\n", + " # Display a tick mark if we have validated enough documents since we last displayed one.\n", + " num_documents_since_last_tick += 1\n", + " if num_documents_since_last_tick >= num_documents_per_tick:\n", + " num_documents_since_last_tick = 0\n", + " print(\".\", end=\"\") # no newline\n", + " \n", + " print(\"]\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "3edf77c7", + "metadata": {}, + "source": [ + "### Dump the collections from the \"transformer\" MongoDB server\n", + "\n", + "Now that the collections have been transformed and validated, dump them **from** the \"transformer\" MongoDB server **into** a local directory." 
+ ] + }, + { + "cell_type": "code", + "id": "db6e432d", + "metadata": { + "scrolled": true + }, + "source": [ + "# Dump the database from the \"transformer\" MongoDB server.\n", + "shell_command = f\"\"\"\n", + " {mongodump} {transformer_mongo_cli_base_options} \\\n", + " --db='{cfg.transformer_mongo_database_name}' \\\n", + " --out='{cfg.transformer_dump_folder_path}' \\\n", + " --gzip\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\") " + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "997fcb281d9d3222", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "### Create a bookkeeper\n", + "\n", + "Create a `Bookkeeper` that can be used to document migration events in the \"origin\" server." + ] + }, + { + "cell_type": "code", + "id": "dbbe706d", + "metadata": {}, + "source": [ + "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "1e0c8891", + "metadata": {}, + "source": [ + "### Indicate — on the \"origin\" server — that the migration is underway\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration has started." + ] + }, + { + "cell_type": "code", + "id": "ca49f61a", + "metadata": {}, + "source": [ + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_STARTED)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "9c253e6f", + "metadata": {}, + "source": [ + "### Drop the original collections from the \"origin\" MongoDB server\n", + "\n", + "This is necessary for situations where collections were renamed or deleted. 
(The `--drop` option of `mongorestore` would only drop collections that exist in the dump being restored, which would not include renamed or deleted collections.)" + ] + }, + { + "cell_type": "code", + "id": "0b26e434", + "metadata": {}, + "source": [ + "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} {origin_mongo_cli_base_options} \\\n", + " --eval 'use {cfg.origin_mongo_database_name}' \\\n", + " --eval 'db.dropDatabase()'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "d84bdc11", + "metadata": {}, + "source": [ + "### Load the collections into the \"origin\" MongoDB server\n", + "\n", + "Load the transformed collections into the \"origin\" MongoDB server." + ] + }, + { + "cell_type": "code", + "id": "1dfbcf0a", + "metadata": { + "scrolled": true + }, + "source": [ + "# Load the transformed collections into the origin server, replacing any same-named ones that are there.\n", + "shell_command = f\"\"\"\n", + " {mongorestore} {origin_mongo_cli_base_options} \\\n", + " --nsFrom='{cfg.transformer_mongo_database_name}.*' \\\n", + " --nsTo='{cfg.origin_mongo_database_name}.*' \\\n", + " --dir='{cfg.transformer_dump_folder_path}' \\\n", + " --stopOnError \\\n", + " --verbose \\\n", + " --drop \\\n", + " --gzip\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\") " + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "ca5ee89a79148499", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "### Indicate that the migration is complete\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration is complete." 
+ ] + }, + { + "cell_type": "code", + "id": "d1eaa6c92789c4f3", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "04c856a8", + "metadata": {}, + "source": [ + "### Restore access to the \"origin\" MongoDB server\n", + "\n", + "This effectively undoes the access revocation that we did earlier." + ] + }, + { + "cell_type": "code", + "id": "9aab3c7e", + "metadata": {}, + "source": [ + "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} {origin_mongo_cli_base_options} \\\n", + " --file='mongosh-scripts/restore-privileges.mongo.js' \\\n", + " --quiet\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}