From 4e51558a9bded88562bd6a9e1b712312bcf693a3 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 15 May 2024 15:27:29 -0700 Subject: [PATCH 01/27] WIP: Draft preliminary notebook that runs all Berkeley migrators --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 787 ++++++++++++++++++ 1 file changed, 787 insertions(+) create mode 100644 demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb new file mode 100644 index 00000000..afb1313a --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -0,0 +1,787 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "source": [ + "# Migrate MongoDB database from `nmdc-schema` `v10.2.0` to `v11.0.0`" + ] + }, + { + "cell_type": "markdown", + "id": "3c31d85d", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "This notebook is special. Unlike all previous notebooks, each of which only used a single migrator; this notebook will be using multiple migrators.\n", + "\n", + "This notebook will be used to migrate the database from `v10.2.0` (i.e. the final version of the pre-Berkeley schema) to `v11.0.0` (i.e. the initial version of the Berkeley schema)." + ] + }, + { + "cell_type": "markdown", + "id": "f65ad4ab", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "markdown", + "id": "37d358ba", + "metadata": {}, + "source": [ + "### 1. Determine MongoDB collections involved.\n", + "\n", + "To determine this, we look at all the migrators listed in this \"meta issue\": https://github.com/microbiomedata/nmdc-schema/issues/1607. In each migrator, we make note of which collections are involved (whether for reading or for writing) and add them to the `COLLECTION_NAMES` list below.\n", + "\n", + "```py\n", + "# TODO: Consider separating them into two lists: `COLLECTIONS_TO_DUMP` and `COLLECTIONS_TO_RESTORE`.\n", + "```" + ] + }, + { + "cell_type": "code", + "id": "09966b0d", + "metadata": {}, + "source": [ + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR23.py\n", + "from_10_2_0_to_PR23 = [\n", + " \"metagenome_assembly_set\",\n", + " \"metagenome_annotation_activity_set\",\n", + " \"metatranscriptome_activity_set\",\n", + " \"mags_activity_set\",\n", + " \"metagenome_sequencing_activity_set\",\n", + " \"read_qc_analysis_activity_set\",\n", + " \"read_based_taxonomy_analysis_activity_set\",\n", + " \"metabolomics_analysis_activity_set\",\n", + " \"metaproteomics_analysis_activity_set\",\n", + " \"nom_analysis_activity_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR4.py\n", + "from_PR23_to_PR4 = [\n", + " \"omics_processing_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR53.py\n", + "from_PR4_to_PR53 = [\n", + " \"omics_processing_set\",\n", + " \"biosample_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR21.py\n", + "from_PR53_to_PR21 = [\n", + " \"study_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR129.py\n", + "from_PR21_to_PR129 = [\n", + " 
\"metabolomics_analysis_activity_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR31.py\n", + "from_PR129_to_PR31 = [\n", + " \"mags_activity_set\",\n", + " \"metabolomics_analysis_activity_set\",\n", + " \"metagenome_annotation_activity_set\",\n", + " \"metagenome_assembly_set\",\n", + " \"metagenome_sequencing_activity_set\",\n", + " \"metatranscriptome_activity_set\",\n", + " \"nom_analysis_activity_set\",\n", + " \"omics_processing_set\",\n", + " \"read_based_taxonomy_analysis_activity_set\",\n", + " \"read_qc_analysis_activity_set\"\n", + " \"metaproteomics_analysis_activity_set\"\n", + "]\n", + "\n", + "# TODO: Ensure this accounts for the collection names in the _final_ version\n", + "# of the `nmdc_schema/migrators/migrator_from_X_to_PR9.py` migrator.\n", + "# See: https://github.com/microbiomedata/berkeley-schema-fy24/pull/127\n", + "from_PR31_to_PR9 = [\n", + " \"metagenome_sequencing_activity_set\",\n", + " \"read_qc_analysis_activity_set\",\n", + " \"metagenome_assembly_set\",\n", + " \"read_based_taxonomy_analysis_activity_set\",\n", + " \"metagenome_annotation_activity_set\",\n", + " \"mags_activity_set\",\n", + " \"metabolomics_analysis_activity_set\",\n", + " \"nom_analysis_activity_set\",\n", + " \"metatranscriptome_activity_set\",\n", + " \"metaproteomics_analysis_activity_set\",\n", + "\n", + " \"omics_processing_set\",\n", + " \"workflow_chain_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR19_and_PR70.py\n", + "from_PR9_to_PR19_PR70 = [\n", + " \"instrument_set\",\n", + " \"omics_processing_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR2_and_PR24.py\n", + "from_PR19_PR70_to_PR2_PR24 = [\n", + " \"omics_processing_set\", \n", + " \"data_generation_set\",\n", + "\n", + " \"mags_activity_set\", \n", + " \"mags_set\",\n", + " \n", + " \"metabolomics_analysis_activity_set\", \n", + " \"metabolomics_analysis_set\",\n", + " \n", + " \"metagenome_annotation_activity_set\", \n", + " \"metagenome_annotation_set\",\n", + " \n", + " \"metagenome_sequencing_activity_set\", \n", + " \"metagenome_sequencing_set\",\n", + " \n", + " \"metaproteomics_analysis_activity_set\", \n", + " \"metaproteomics_analysis_set\",\n", + " \n", + " \"metatranscriptome_activity_set\",\n", + " \"metatranscriptome_analysis_set\",\n", + " \n", + " \"nom_analysis_activity_set\",\n", + " \"nom_analysis_set\",\n", + " \n", + " \"read_based_taxonomy_analysis_activity_set\",\n", + " \"read_based_taxonomy_analysis_set\",\n", + " \n", + " \"read_qc_analysis_activity_set\",\n", + " \"read_qc_analysis_set\",\n", + " \n", + " \"activity_set\",\n", + " \"workflow_execution_set\" \n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR10.py\n", + "from_PR2_PR24_to_PR10 = [\n", + " \"biosample_set\",\n", + " \"data_object_set\",\n", + " \"functional_annotation_agg\",\n", + " \"study_set\",\n", + " \"extraction_set\",\n", + " \"field_research_site_set\",\n", + " \"library_preparation_set\",\n", + " \"mags_set\",\n", + " \"metabolomics_analysis_set\",\n", + " \"metagenome_annotation_set\",\n", + " \"metagenome_assembly_set\",\n", + " \"metagenome_sequencing_set\",\n", + " \"metaproteomics_analysis_set\",\n", + " \"metatranscriptome_analysis_set\",\n", + " 
\"nom_analysis_set\",\n", + " \"data_generation_set\",\n", + " \"pooling_set\",\n", + " \"processed_sample_set\",\n", + " \"read_based_taxonomy_analysis_set\",\n", + " \"read_qc_analysis_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR3.py\n", + "from_PR10_to_PR3 = [\n", + " \"data_generation_set\",\n", + "]\n", + "\n", + "# Note: `*arr` in Python is like `...arr` in JavaScript (it's a \"spread\" operator).\n", + "COLLECTION_NAMES: list[str] = [\n", + " *from_10_2_0_to_PR23,\n", + " *from_PR23_to_PR4,\n", + " *from_PR4_to_PR53,\n", + " *from_PR53_to_PR21,\n", + " *from_PR21_to_PR129,\n", + " *from_PR129_to_PR31,\n", + " *from_PR31_to_PR9,\n", + " *from_PR9_to_PR19_PR70,\n", + " *from_PR19_PR70_to_PR2_PR24,\n", + " *from_PR2_PR24_to_PR10,\n", + " *from_PR10_to_PR3,\n", + "]\n", + "print(str(len(COLLECTION_NAMES)) + \" collection names\")\n", + "\n", + "# Eliminate duplicates.\n", + "COLLECTION_NAMES = list(set(COLLECTION_NAMES))\n", + "print(str(len(COLLECTION_NAMES)) + \" collection names (distinct)\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "17f351e8", + "metadata": {}, + "source": [ + "### 2. Coordinate with stakeholders.\n", + "\n", + "We will be enacting full Runtime and Database downtime for this migration. Ensure stakeholders are aware of that." + ] + }, + { + "cell_type": "markdown", + "id": "233a35c3", + "metadata": {}, + "source": [ + "### 3. Set up environment.\n", + "\n", + "Here, you'll prepare an environment for running this notebook.\n", + "\n", + "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database named `nmdc`).\n", + " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command (this MongoDB server will be accessible without a username or password).\n", + " ```shell\n", + " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4\n", + " ```\n", + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. You can use `.notebook.env.example` as a template:\n", + " ```shell\n", + " $ cp .notebook.env.example .notebook.env\n", + " ```\n", + "3. Create and populate the two **MongoDB configuration files** that this notebook will use to connect to the \"origin\" and \"transformer\" MongoDB servers. The \"origin\" MongoDB server is the one that contains the database you want to migrate; and the \"transformer\" MongoDB server is the one you want to use to perform the data transformations. In practice, the \"origin\" MongoDB server is typically a remote server, and the \"transformer\" MongoDB server is typically a local server.\n", + " 1. You can use `.mongo.yaml.example` as a template:\n", + " ```shell\n", + " $ cp .mongo.yaml.example .mongo.origin.yaml\n", + " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", + " ```\n", + " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database.\n", + "\n", + "```py\n", + "# TODO: On the Berkeley Mongo server, create a Mongo user (e.g. 
`app.nmdc-migrator`) having the new `nmdc_migrator` role; \n", + "# then update `.mongo.origin.yaml` so it contains that user's credentials.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "69937b18", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "fe81196a", + "metadata": {}, + "source": [ + "### Install Python dependencies\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook cells) now.\n", + "\n", + "References: \n", + "- https://pypi.org/project/nmdc-schema/\n", + "- https://github.com/microbiomedata/berkeley-schema-fy24\n", + "- How to `pip install` a Git branch: https://stackoverflow.com/a/20101940" + ] + }, + { + "cell_type": "code", + "id": "e25a0af308c3185b", + "metadata": { + "collapsed": false + }, + "source": [ + "%pip install -r requirements.txt\n", + "%pip install git+https://github.com/microbiomedata/berkeley-schema-fy24@eecavanna/temp-all-migrators-2" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "a407c354", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "```py\n", + "# TODO: Import three more migrators, once they become available in a published `nmdc-schema` package.\n", + "```" + ] + }, + { + "cell_type": "code", + "id": "dbecd561", + "metadata": {}, + "source": [ + "# Third-party packages:\n", + "import pymongo\n", + "from jsonschema import Draft7Validator\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", + "\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR23 import Migrator as M1\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR4 import Migrator as M2\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR53 import Migrator as M3\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR21 import Migrator as M4\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR129 import Migrator as M5\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR31 import Migrator as M6\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR9 import Migrator as M7\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR19_and_PR70 import Migrator as M8\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR2_and_PR24 import Migrator as M9\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR10 import Migrator as M10\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR3 import Migrator as M11\n", + "\n", + "# First-party packages:\n", + "from helpers import Config\n", + "from bookkeeper import Bookkeeper, MigrationEvent" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "99b20ff4", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." 
+ ] + }, + { + "cell_type": "code", + "id": "1eac645a", + "metadata": {}, + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path\n", + "\n", + "# Perform a sanity test of the application paths.\n", + "!{mongodump} --version\n", + "!{mongorestore} --version" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "68245d2b", + "metadata": {}, + "source": [ + "### Create MongoDB clients\n", + "\n", + "Create MongoDB clients you can use to access the \"origin\" and \"transformer\" MongoDB servers." + ] + }, + { + "cell_type": "code", + "id": "8e95f559", + "metadata": {}, + "source": [ + "# Mongo client for \"origin\" MongoDB server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "\n", + "# Mongo client for \"transformer\" MongoDB server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", + "\n", + "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", + "with pymongo.timeout(3):\n", + " # Display the MongoDB server version (running on the \"origin\" Mongo server).\n", + " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the origin database exists.\n", + " assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", + " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the transformation database does not exist.\n", + " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "bc387abc62686091", + "metadata": { + "collapsed": false + }, + "source": [ + "### Create a bookkeeper\n", + "\n", + "Create a `Bookkeeper` that can be used to document migration events in the \"origin\" server." + ] + }, + { + "cell_type": "code", + "id": "5c982eb0c04e606d", + "metadata": { + "collapsed": false + }, + "source": [ + "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "3975ac24", + "metadata": {}, + "source": [ + "### Create JSON Schema validator\n", + "\n", + "In this step, you'll create a JSON Schema validator for the NMDC Schema." + ] + }, + { + "cell_type": "code", + "id": "9e2dbb92", + "metadata": {}, + "source": [ + "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema) # Is there a newer validator class available; e.g. 
draft 2019?\n", + "\n", + "# Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", + "# Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema\n", + "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", + "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", + "\n", + "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "fd4994a0", + "metadata": {}, + "source": [ + "### Dump collections from the \"origin\" MongoDB server\n", + "\n", + "Use `mongodump` to dump the collections involved in this migration **from** the \"origin\" MongoDB server **into** a local directory.\n", + "\n", + "> Since `mongodump` doesn't provide a CLI option we can use to specify the collections we _want_ the dump to include, we use multiple occurrences of the `--excludeCollection` CLI option to exclude each collection we do _not_ want the dump to include. The end result is the same—there's just that extra step involved." + ] + }, + { + "cell_type": "code", + "id": "cf8fa1ca", + "metadata": {}, + "source": [ + "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", + "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", + "non_agenda_collection_names = [name for name in all_collection_names if name not in COLLECTION_NAMES]\n", + "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", + "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "print(exclusion_options_str)\n", + "\n", + "# Dump the not-excluded collections from the \"origin\" database.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.origin_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "c3e3c9c4", + "metadata": {}, + "source": [ + "### Load the dumped collections into the \"transformer\" MongoDB server\n", + "\n", + "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server.\n", + "\n", + "> Since it's possible that the dump included extra collections (due to someone having created a collection between the time you generated the `--excludeCollection` CLI options and the time you ran `mongodump` above), we will use the `--nsInclude` CLI option to indicate which specific collections—from the dump—we want to load into the \"transformer\" database." 
+ ] + }, + { + "cell_type": "code", + "id": "418571c5", + "metadata": {}, + "source": [ + "# Build a string containing zero or more `--nsInclude=\"...\"` options, which can be included in a `mongorestore` command.\n", + "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in COLLECTION_NAMES]\n", + "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "print(inclusion_options_str)\n", + "\n", + "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --stopOnError \\\n", + " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", + " {inclusion_options_str}" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "4c090068", + "metadata": {}, + "source": [ + "### Transform the collections within the \"transformer\" MongoDB server\n", + "\n", + "Use the migrator to transform the collections in the \"transformer\" database.\n", + "\n", + "> Reminder: The database transformation functions are defined in the `nmdc-schema` Python package installed earlier.\n", + "\n", + "> Reminder: The \"origin\" database is **not** affected by this step." + ] + }, + { + "cell_type": "code", + "id": "05869340", + "metadata": {}, + "source": [ + "from datetime import datetime\n", + "\n", + "def get_timestamp():\n", + " return datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", + "\n", + "\n", + "# Instantiate a MongoAdapter bound to the \"transformer\" database.\n", + "adapter = MongoAdapter(\n", + " database=transformer_mongo_client[\"nmdc\"],\n", + " on_collection_created=lambda name: print(f'Created collection \"{name}\"'),\n", + " on_collection_renamed=lambda old_name, name: print(f'Renamed collection \"{old_name}\" to \"{name}\"'),\n", + " on_collection_deleted=lambda name: print(f'Deleted collection \"{name}\"'),\n", + ")\n", + "\n", + "# Instantiate Migrators bound to that adapter.\n", + "migrator1 = M1(adapter=adapter)\n", + "migrator2 = M2(adapter=adapter)\n", + "migrator3 = M3(adapter=adapter)\n", + "migrator4 = M4(adapter=adapter)\n", + "migrator5 = M5(adapter=adapter)\n", + "migrator6 = M6(adapter=adapter)\n", + "migrator7 = M7(adapter=adapter)\n", + "migrator8 = M8(adapter=adapter)\n", + "migrator9 = M9(adapter=adapter)\n", + "migrator10 = M10(adapter=adapter)\n", + "migrator11 = M11(adapter=adapter)\n", + "\n", + "# Execute the Migrator's `upgrade` method to perform the migration.\n", + "print(f\"[{get_timestamp()}] Calling migrator1.upgrade()\")\n", + "migrator1.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator2.upgrade()\")\n", + "migrator2.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator3.upgrade()\")\n", + "migrator3.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator4.upgrade()\")\n", + "migrator4.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator5.upgrade()\")\n", + "migrator5.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator6.upgrade()\")\n", + "migrator6.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator7.upgrade()\")\n", + "migrator7.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator8.upgrade()\")\n", + "migrator8.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator9.upgrade()\")\n", + "migrator9.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator10.upgrade()\")\n", + "migrator10.upgrade()\n", + "print(f\"[{get_timestamp()}] 
Calling migrator11.upgrade()\")\n", + "migrator11.upgrade()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "3edf77c7", + "metadata": {}, + "source": [ + "### Validate the transformed documents\n", + "\n", + "Now that we have transformed the database, validate each document in each collection in the \"transformer\" MongoDB server.\n", + "\n", + "> Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py" + ] + }, + { + "cell_type": "code", + "id": "db6e432d", + "metadata": {}, + "source": [ + "for collection_name in COLLECTION_NAMES:\n", + " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " for document in collection.find():\n", + " # Validate the transformed document.\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", + " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", + " #\n", + " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", + " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", + " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "997fcb281d9d3222", + "metadata": { + "collapsed": false + }, + "source": [ + "### Indicate that the migration is underway\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration has started." 
+ ] + }, + { + "cell_type": "code", + "id": "fcafd862e1becb98", + "metadata": { + "collapsed": false + }, + "source": [ + "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator3, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator4, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator5, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator6, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator7, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "1e0c8891", + "metadata": {}, + "source": [ + "### Dump the collections from the \"transformer\" MongoDB server\n", + "\n", + "Now that the collections have been transformed and validated, dump them **from** the \"transformer\" MongoDB server **into** a local directory." + ] + }, + { + "cell_type": "code", + "id": "ca49f61a", + "metadata": {}, + "source": [ + "# Dump the database from the \"transformer\" MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "d84bdc11", + "metadata": {}, + "source": [ + "### Load the collections into the \"origin\" MongoDB server\n", + "\n", + "Load the transformed collections into the \"origin\" MongoDB server, **replacing** the collections there that have the same names.\n", + "\n", + "> Note: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually." + ] + }, + { + "cell_type": "code", + "id": "1dfbcf0a", + "metadata": {}, + "source": [ + "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", + " --drop \\\n", + " --stopOnError \\\n", + " --preserveUUID \\\n", + " {inclusion_options_str}" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "ca5ee89a79148499", + "metadata": { + "collapsed": false + }, + "source": [ + "### Indicate that the migration is complete\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration is complete." 
+ ] + }, + { + "cell_type": "code", + "id": "d1eaa6c92789c4f3", + "metadata": { + "collapsed": false + }, + "source": [ + "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator3, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator4, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator5, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator6, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator7, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_COMPLETED)" + ], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 86085d59fdeaba4fa57570b788a650545320a378 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sat, 25 May 2024 23:35:50 -0700 Subject: [PATCH 02/27] Load schema `11.0.0rc4` from PyPI and add twelfth migrator --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index afb1313a..0a22ec3d 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -49,6 +49,14 @@ "id": "09966b0d", "metadata": {}, "source": [ + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_unknown.py\n", + "from_X_to_unknown = [\n", + " \"omics_processing_set\",\n", + " \"pooling_set\",\n", + " \"library_preparation_set\",\n", + " \"extraction_set\"\n", + "]\n", + "\n", "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR23.py\n", "from_10_2_0_to_PR23 = [\n", " \"metagenome_assembly_set\",\n", @@ -191,6 +199,7 @@ "\n", "# Note: `*arr` in Python is like `...arr` in JavaScript (it's a \"spread\" operator).\n", "COLLECTION_NAMES: list[str] = [\n", + " *from_X_to_unknown,\n", " *from_10_2_0_to_PR23,\n", " *from_PR23_to_PR4,\n", " *from_PR4_to_PR53,\n", @@ -250,8 +259,7 @@ " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database.\n", "\n", "```py\n", - "# TODO: On the Berkeley Mongo server, create a Mongo user (e.g. 
`app.nmdc-migrator`) having the new `nmdc_migrator` role; \n", - "# then update `.mongo.origin.yaml` so it contains that user's credentials.\n", + "# TODO: Update the `.mongo.origin.yaml` file to use the `nmdc_migrator` user on the Berkeley Mongo server.\n", "```" ] }, @@ -288,7 +296,7 @@ }, "source": [ "%pip install -r requirements.txt\n", - "%pip install git+https://github.com/microbiomedata/berkeley-schema-fy24@eecavanna/temp-all-migrators-2" + "%pip install nmdc-schema==11.0.0rc4" ], "outputs": [], "execution_count": null @@ -300,11 +308,7 @@ "source": [ "### Import Python dependencies\n", "\n", - "Import the Python objects upon which this notebook depends.\n", - "\n", - "```py\n", - "# TODO: Import three more migrators, once they become available in a published `nmdc-schema` package.\n", - "```" + "Import the Python objects upon which this notebook depends." ] }, { @@ -318,17 +322,18 @@ "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", "\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR23 import Migrator as M1\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR4 import Migrator as M2\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR53 import Migrator as M3\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR21 import Migrator as M4\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR129 import Migrator as M5\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR31 import Migrator as M6\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR9 import Migrator as M7\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR19_and_PR70 import Migrator as M8\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR2_and_PR24 import Migrator as M9\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR10 import Migrator as M10\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR3 import Migrator as M11\n", + "from nmdc_schema.migrators.migrator_from_X_to_unknown import Migrator as M1\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR23 import Migrator as M2\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR4 import Migrator as M3\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR53 import Migrator as M4\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR21 import Migrator as M5\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR129 import Migrator as M6\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR31 import Migrator as M7\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR9 import Migrator as M8\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR19_and_PR70 import Migrator as M9\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR2_and_PR24 import Migrator as M10\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR10 import Migrator as M11\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR3 import Migrator as M12\n", "\n", "# First-party packages:\n", "from helpers import Config\n", @@ -499,7 +504,9 @@ "\n", "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server.\n", "\n", - "> Since it's possible that the dump included extra collections (due to someone having created a collection between the time you generated the `--excludeCollection` CLI options and the time you ran `mongodump` above), we will use the `--nsInclude` CLI option to indicate which specific collections—from the dump—we want to load into the \"transformer\" database." 
+ "> Since it's possible that the dump included extra collections (due to someone having created a collection between the time you generated the `--excludeCollection` CLI options and the time you ran `mongodump` above), we will use the `--nsInclude` CLI option to indicate which specific collections—from the dump—we want to load into the \"transformer\" database.\n", + "\n", + "> Note: This step typically takes 3 minutes (on a MacBook Pro M1, when running MongoDB in a Docker container)." ] }, { @@ -570,6 +577,7 @@ "migrator9 = M9(adapter=adapter)\n", "migrator10 = M10(adapter=adapter)\n", "migrator11 = M11(adapter=adapter)\n", + "migrator12 = M12(adapter=adapter)\n", "\n", "# Execute the Migrator's `upgrade` method to perform the migration.\n", "print(f\"[{get_timestamp()}] Calling migrator1.upgrade()\")\n", @@ -593,7 +601,9 @@ "print(f\"[{get_timestamp()}] Calling migrator10.upgrade()\")\n", "migrator10.upgrade()\n", "print(f\"[{get_timestamp()}] Calling migrator11.upgrade()\")\n", - "migrator11.upgrade()" + "migrator11.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator12.upgrade()\")\n", + "migrator12.upgrade()" ], "outputs": [], "execution_count": null @@ -666,7 +676,8 @@ "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_STARTED)\n", "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_STARTED)\n", "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)" + "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_STARTED)" ], "outputs": [], "execution_count": null @@ -757,7 +768,8 @@ "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_COMPLETED)" + "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_COMPLETED)" ], "outputs": [], "execution_count": null From ae90ac0d891cbb6ddd7718ad4393291922589a89 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 31 May 2024 13:46:06 -0700 Subject: [PATCH 03/27] Append `migrator_from_X_to_PR176.py` migrator to sequence --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 120 ++++++++++-------- 1 file changed, 69 insertions(+), 51 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index 0a22ec3d..efae3c7b 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -46,8 +46,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "09966b0d", "metadata": {}, + "outputs": [], "source": [ "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_unknown.py\n", "from_X_to_unknown = [\n", @@ -197,6 +199,11 @@ " \"data_generation_set\",\n", "]\n", "\n", + "# 
https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR176.py\n", + "from_X_to_PR176 = [\n", + " \"read_qc_analysis_set\",\n", + "]\n", + "\n", "# Note: `*arr` in Python is like `...arr` in JavaScript (it's a \"spread\" operator).\n", "COLLECTION_NAMES: list[str] = [\n", " *from_X_to_unknown,\n", @@ -211,15 +218,14 @@ " *from_PR19_PR70_to_PR2_PR24,\n", " *from_PR2_PR24_to_PR10,\n", " *from_PR10_to_PR3,\n", + " *from_X_to_PR176,\n", "]\n", "print(str(len(COLLECTION_NAMES)) + \" collection names\")\n", "\n", "# Eliminate duplicates.\n", "COLLECTION_NAMES = list(set(COLLECTION_NAMES))\n", "print(str(len(COLLECTION_NAMES)) + \" collection names (distinct)\")" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -259,7 +265,7 @@ " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database.\n", "\n", "```py\n", - "# TODO: Update the `.mongo.origin.yaml` file to use the `nmdc_migrator` user on the Berkeley Mongo server.\n", + "# TODO: On the Berkeley Mongo server, create the migrator Mongo user that has the new `nmdc-migrator` role; then update `.mongo.origin.yaml`.\n", "```" ] }, @@ -290,16 +296,17 @@ }, { "cell_type": "code", + "execution_count": null, "id": "e25a0af308c3185b", "metadata": { "collapsed": false }, + "outputs": [], "source": [ + "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rc4" - ], - "outputs": [], - "execution_count": null + "%pip install nmdc-schema==11.0.0rc5" + ] }, { "cell_type": "markdown", @@ -313,8 +320,15 @@ }, { "cell_type": "code", + "execution_count": null, "id": "dbecd561", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-05T00:46:18.764498Z", + "start_time": "2024-03-05T00:46:18.202997Z" + } + }, + "outputs": [], "source": [ "# Third-party packages:\n", "import pymongo\n", @@ -334,13 +348,12 @@ "from nmdc_schema.migrators.migrator_from_X_to_PR2_and_PR24 import Migrator as M10\n", "from nmdc_schema.migrators.migrator_from_X_to_PR10 import Migrator as M11\n", "from nmdc_schema.migrators.migrator_from_X_to_PR3 import Migrator as M12\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR176 import Migrator as M13\n", "\n", "# First-party packages:\n", "from helpers import Config\n", "from bookkeeper import Bookkeeper, MigrationEvent" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -354,8 +367,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "1eac645a", "metadata": {}, + "outputs": [], "source": [ "cfg = Config()\n", "\n", @@ -366,9 +381,7 @@ "# Perform a sanity test of the application paths.\n", "!{mongodump} --version\n", "!{mongorestore} --version" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -382,8 +395,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "8e95f559", "metadata": {}, + "outputs": [], "source": [ "# Mongo client for \"origin\" MongoDB server.\n", "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", @@ -404,9 +419,7 @@ "\n", " # Sanity test: Ensure the transformation database does not exist.\n", " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -422,15 +435,15 @@ }, { "cell_type": "code", + 
"execution_count": null, "id": "5c982eb0c04e606d", "metadata": { "collapsed": false }, + "outputs": [], "source": [ "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -444,8 +457,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "9e2dbb92", "metadata": {}, + "outputs": [], "source": [ "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema) # Is there a newer validator class available; e.g. draft 2019?\n", @@ -456,9 +471,7 @@ "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -474,8 +487,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "cf8fa1ca", "metadata": {}, + "outputs": [], "source": [ "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", @@ -491,9 +506,7 @@ " --gzip \\\n", " --out=\"{cfg.origin_dump_folder_path}\" \\\n", " {exclusion_options_str}" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -511,8 +524,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "418571c5", "metadata": {}, + "outputs": [], "source": [ "# Build a string containing zero or more `--nsInclude=\"...\"` options, which can be included in a `mongorestore` command.\n", "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in COLLECTION_NAMES]\n", @@ -528,9 +543,7 @@ " --stopOnError \\\n", " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", " {inclusion_options_str}" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -548,8 +561,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "05869340", "metadata": {}, + "outputs": [], "source": [ "from datetime import datetime\n", "\n", @@ -578,6 +593,7 @@ "migrator10 = M10(adapter=adapter)\n", "migrator11 = M11(adapter=adapter)\n", "migrator12 = M12(adapter=adapter)\n", + "migrator13 = M13(adapter=adapter)\n", "\n", "# Execute the Migrator's `upgrade` method to perform the migration.\n", "print(f\"[{get_timestamp()}] Calling migrator1.upgrade()\")\n", @@ -603,10 +619,10 @@ "print(f\"[{get_timestamp()}] Calling migrator11.upgrade()\")\n", "migrator11.upgrade()\n", "print(f\"[{get_timestamp()}] Calling migrator12.upgrade()\")\n", - "migrator12.upgrade()" - ], - "outputs": [], - "execution_count": null + "migrator12.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator13.upgrade()\")\n", + "migrator13.upgrade()" + ] }, { "cell_type": "markdown", @@ -622,8 +638,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "db6e432d", "metadata": {}, + "outputs": [], "source": [ "for collection_name in COLLECTION_NAMES:\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", @@ -643,9 +661,7 @@ " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -661,10 +677,12 @@ }, { "cell_type": 
"code", + "execution_count": null, "id": "fcafd862e1becb98", "metadata": { "collapsed": false }, + "outputs": [], "source": [ "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_STARTED)\n", "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_STARTED)\n", @@ -677,10 +695,9 @@ "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_STARTED)\n", "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_STARTED)\n", "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_STARTED)" - ], - "outputs": [], - "execution_count": null + "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_STARTED)" + ] }, { "cell_type": "markdown", @@ -694,8 +711,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "ca49f61a", "metadata": {}, + "outputs": [], "source": [ "# Dump the database from the \"transformer\" MongoDB server.\n", "!{mongodump} \\\n", @@ -704,9 +723,7 @@ " --gzip \\\n", " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", " {exclusion_options_str}" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -722,8 +739,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "1dfbcf0a", "metadata": {}, + "outputs": [], "source": [ "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", "!{mongorestore} \\\n", @@ -735,9 +754,7 @@ " --stopOnError \\\n", " --preserveUUID \\\n", " {inclusion_options_str}" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -753,10 +770,12 @@ }, { "cell_type": "code", + "execution_count": null, "id": "d1eaa6c92789c4f3", "metadata": { "collapsed": false }, + "outputs": [], "source": [ "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_COMPLETED)\n", @@ -769,10 +788,9 @@ "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_COMPLETED)" - ], - "outputs": [], - "execution_count": null + "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_COMPLETED)" + ] } ], "metadata": { From 83198c21c2668857db7445d7338e85c7de6c18fe Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 6 Jun 2024 18:29:21 -0700 Subject: [PATCH 04/27] Append migrator and sync `pymongo` version with `nmdc-schema`'s --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 72 ++++++++++++++++--- .../notebooks/requirements.txt | 2 +- 2 files changed, 62 insertions(+), 12 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index efae3c7b..7ae99e95 100644 --- 
a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -204,6 +204,14 @@ " \"read_qc_analysis_set\",\n", "]\n", "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_PR176_to_PR104.py\n", + "from_PR176_to_PR104 = [\n", + " \"data_generation_set\"\n", + "]\n", + "\n", + "# TODO: Populate this.\n", + "#from_PR176_to_PR104 = [],\n", + "\n", "# Note: `*arr` in Python is like `...arr` in JavaScript (it's a \"spread\" operator).\n", "COLLECTION_NAMES: list[str] = [\n", " *from_X_to_unknown,\n", @@ -219,6 +227,8 @@ " *from_PR2_PR24_to_PR10,\n", " *from_PR10_to_PR3,\n", " *from_X_to_PR176,\n", + " *from_PR176_to_PR104,\n", + " #*from_PR104_to_PR195,\n", "]\n", "print(str(len(COLLECTION_NAMES)) + \" collection names\")\n", "\n", @@ -305,7 +315,7 @@ "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rc5" + "%pip install nmdc-schema==11.0.0rc8" ] }, { @@ -322,12 +332,7 @@ "cell_type": "code", "execution_count": null, "id": "dbecd561", - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-05T00:46:18.764498Z", - "start_time": "2024-03-05T00:46:18.202997Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Third-party packages:\n", @@ -349,6 +354,7 @@ "from nmdc_schema.migrators.migrator_from_X_to_PR10 import Migrator as M11\n", "from nmdc_schema.migrators.migrator_from_X_to_PR3 import Migrator as M12\n", "from nmdc_schema.migrators.migrator_from_X_to_PR176 import Migrator as M13\n", + "from nmdc_schema.migrators.migrator_from_PR176_to_PR104 import Migrator as M14\n", "\n", "# First-party packages:\n", "from helpers import Config\n", @@ -473,6 +479,18 @@ "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" ] }, + { + "cell_type": "markdown", + "id": "74e2960b", + "metadata": {}, + "source": [ + "### TODO: Revoke write access to the \"origin\" MongoDB server\n", + "\n", + "This is so people don't make changes to the original data while the migration is happening, given that the migration ends with an overwriting of the original data.\n", + "\n", + "Note: The migrator Mongo user may need additional permissions in order to manipulate Mongo user roles to the extent necessary to accomplish this step." + ] + }, { "cell_type": "markdown", "id": "fd4994a0", @@ -482,7 +500,9 @@ "\n", "Use `mongodump` to dump the collections involved in this migration **from** the \"origin\" MongoDB server **into** a local directory.\n", "\n", - "> Since `mongodump` doesn't provide a CLI option we can use to specify the collections we _want_ the dump to include, we use multiple occurrences of the `--excludeCollection` CLI option to exclude each collection we do _not_ want the dump to include. The end result is the same—there's just that extra step involved." + "> Since `mongodump` doesn't provide a CLI option we can use to specify the collections we _want_ the dump to include, we use multiple occurrences of the `--excludeCollection` CLI option to exclude each collection we do _not_ want the dump to include. The end result is the same—there's just that extra step involved.\n", + "\n", + "- TODO: Consider ensuring that the local dump target folder is empty before doing this dump." 
] }, { @@ -594,6 +614,7 @@ "migrator11 = M11(adapter=adapter)\n", "migrator12 = M12(adapter=adapter)\n", "migrator13 = M13(adapter=adapter)\n", + "migrator14 = M14(adapter=adapter)\n", "\n", "# Execute the Migrator's `upgrade` method to perform the migration.\n", "print(f\"[{get_timestamp()}] Calling migrator1.upgrade()\")\n", @@ -621,7 +642,9 @@ "print(f\"[{get_timestamp()}] Calling migrator12.upgrade()\")\n", "migrator12.upgrade()\n", "print(f\"[{get_timestamp()}] Calling migrator13.upgrade()\")\n", - "migrator13.upgrade()" + "migrator13.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator14.upgrade()\")\n", + "migrator14.upgrade()" ] }, { @@ -643,7 +666,12 @@ "metadata": {}, "outputs": [], "source": [ + "# FIXME: Only validate documents in the collections that we will be restoring.\n", + "# Note: If a collection named in `COLLECTION_NAMES` doesn't exist anymore,\n", + "# I think the inner `for` loop will just have zero iterations.\n", + "\n", "for collection_name in COLLECTION_NAMES:\n", + " print(collection_name)\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", " for document in collection.find():\n", " # Validate the transformed document.\n", @@ -696,7 +724,8 @@ "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_STARTED)\n", "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)\n", "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_STARTED)" + "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_STARTED)" ] }, { @@ -725,6 +754,16 @@ " {exclusion_options_str}" ] }, + { + "cell_type": "markdown", + "id": "9c253e6f", + "metadata": {}, + "source": [ + "### TODO: Drop the original collections from the \"origin\" MongoDB server\n", + "\n", + "This is necessary for situations where collections were renamed or deleted. The `--drop` option of `mongorestore` only drops collections that exist in the dump. We may need `mongosh` for this." + ] + }, { "cell_type": "markdown", "id": "d84bdc11", @@ -789,7 +828,18 @@ "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_COMPLETED)" + "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_COMPLETED)" + ] + }, + { + "cell_type": "markdown", + "id": "04c856a8", + "metadata": {}, + "source": [ + "### TODO: Reinstate write access to the MongoDB server\n", + "\n", + "This effectively un-does the access revocation that we did earlier." 
] } ], diff --git a/demo/metadata_migration/notebooks/requirements.txt b/demo/metadata_migration/notebooks/requirements.txt index 4125ed9f..0096244c 100644 --- a/demo/metadata_migration/notebooks/requirements.txt +++ b/demo/metadata_migration/notebooks/requirements.txt @@ -1,5 +1,5 @@ dictdiffer==0.9.0 jsonschema==4.19.2 -pymongo==4.5.0 +pymongo==4.7.2 python-dotenv==1.0.0 PyYAML==6.0.1 \ No newline at end of file From 22427f8463aa64ecdc6082c2cd614ea9e6274fb7 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 12 Jun 2024 15:43:52 -0700 Subject: [PATCH 05/27] Append additional migrators and use schema `v11.0.0rc11` --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 223 +++++++++++------- 1 file changed, 143 insertions(+), 80 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index 7ae99e95..b7a572ea 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -109,9 +109,7 @@ " \"metaproteomics_analysis_activity_set\"\n", "]\n", "\n", - "# TODO: Ensure this accounts for the collection names in the _final_ version\n", - "# of the `nmdc_schema/migrators/migrator_from_X_to_PR9.py` migrator.\n", - "# See: https://github.com/microbiomedata/berkeley-schema-fy24/pull/127\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR9.py\n", "from_PR31_to_PR9 = [\n", " \"metagenome_sequencing_activity_set\",\n", " \"read_qc_analysis_activity_set\",\n", @@ -209,8 +207,50 @@ " \"data_generation_set\"\n", "]\n", "\n", - "# TODO: Populate this.\n", - "#from_PR176_to_PR104 = [],\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR192.py\n", + "from_X_to_PR192 = [\n", + " \"extraction_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_PR104_to_PR195.py\n", + "from_PR104_to_PR195 = [\n", + " \"collecting_biosamples_from_site_set\",\n", + " \"protocol_execution_set\",\n", + " \"storage_process_set\",\n", + " \"material_processing_set\",\n", + " \"pooling_set\",\n", + " \"extraction_set\",\n", + " \"library_preparation_set\",\n", + " \"sub_sampling_process_set\",\n", + " \"mixing_process_set\",\n", + " \"filtration_process_set\",\n", + " \"chromatographic_separation_process_set\",\n", + " \"dissolving_process_set\",\n", + " \"chemical_conversion_process_set\",\n", + " \"data_generation_set\",\n", + " \"nucleotide_sequencing_set\",\n", + " \"mass_spectrometry_set\",\n", + " \"workflow_chain_set\",\n", + " \"workflow_execution_set\",\n", + " \"metagenome_annotation_set\",\n", + " \"metagenome_assembly_set\",\n", + " \"metatranscriptome_assembly_set\",\n", + " \"metatranscriptome_annotation_set\",\n", + " \"metatranscriptome_analysis_set\",\n", + " \"mags_analysis_set\",\n", + " \"metagenome_sequencing_set\",\n", + " \"read_qc_analysis_set\",\n", + " \"read_based_taxonomy_analysis_set\",\n", + " \"metabolomics_analysis_set\",\n", + " \"metaproteomics_analysis_set\",\n", + " \"nom_analysis_set\",\n", + "]\n", + "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_PR195_to_unknown.py\n", + "from_PR195_to_unknown = [\n", + " \"workflow_execution_set\",\n", + " \"workflow_chain_set\",\n", + "]\n", "\n", "# Note: `*arr` in Python is like `...arr` in JavaScript 
(it's a \"spread\" operator).\n", "COLLECTION_NAMES: list[str] = [\n", @@ -228,7 +268,9 @@ " *from_PR10_to_PR3,\n", " *from_X_to_PR176,\n", " *from_PR176_to_PR104,\n", - " #*from_PR104_to_PR195,\n", + " *from_X_to_PR192,\n", + " *from_PR104_to_PR195,\n", + " *from_PR195_to_unknown,\n", "]\n", "print(str(len(COLLECTION_NAMES)) + \" collection names\")\n", "\n", @@ -272,11 +314,7 @@ " $ cp .mongo.yaml.example .mongo.origin.yaml\n", " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", " ```\n", - " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database.\n", - "\n", - "```py\n", - "# TODO: On the Berkeley Mongo server, create the migrator Mongo user that has the new `nmdc-migrator` role; then update `.mongo.origin.yaml`.\n", - "```" + " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database." ] }, { @@ -315,7 +353,7 @@ "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rc8" + "%pip install nmdc-schema==11.0.0rc11" ] }, { @@ -340,7 +378,6 @@ "from jsonschema import Draft7Validator\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", - "\n", "from nmdc_schema.migrators.migrator_from_X_to_unknown import Migrator as M1\n", "from nmdc_schema.migrators.migrator_from_X_to_PR23 import Migrator as M2\n", "from nmdc_schema.migrators.migrator_from_X_to_PR4 import Migrator as M3\n", @@ -348,13 +385,16 @@ "from nmdc_schema.migrators.migrator_from_X_to_PR21 import Migrator as M5\n", "from nmdc_schema.migrators.migrator_from_X_to_PR129 import Migrator as M6\n", "from nmdc_schema.migrators.migrator_from_X_to_PR31 import Migrator as M7\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR9 import Migrator as M8\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR9 import Migrator as M8 #### Creates the `workflow_chain_set` collection.\n", "from nmdc_schema.migrators.migrator_from_X_to_PR19_and_PR70 import Migrator as M9\n", "from nmdc_schema.migrators.migrator_from_X_to_PR2_and_PR24 import Migrator as M10\n", "from nmdc_schema.migrators.migrator_from_X_to_PR10 import Migrator as M11\n", "from nmdc_schema.migrators.migrator_from_X_to_PR3 import Migrator as M12\n", "from nmdc_schema.migrators.migrator_from_X_to_PR176 import Migrator as M13\n", "from nmdc_schema.migrators.migrator_from_PR176_to_PR104 import Migrator as M14\n", + "from nmdc_schema.migrators.migrator_from_X_to_PR192 import Migrator as M15\n", + "from nmdc_schema.migrators.migrator_from_PR104_to_PR195 import Migrator as M16\n", + "from nmdc_schema.migrators.migrator_from_PR195_to_unknown import Migrator as M17\n", "\n", "# First-party packages:\n", "from helpers import Config\n", @@ -434,9 +474,11 @@ "collapsed": false }, "source": [ - "### Create a bookkeeper\n", + "### Create JSON Schema validator\n", "\n", - "Create a `Bookkeeper` that can be used to document migration events in the \"origin\" server." + "In this step, you'll create a JSON Schema validator for the NMDC Schema.\n", + "\n", + "- TODO: Consider whether the JSON Schema validator version is consistent with the JSON Schema version (e.g. draft 7 versus draft 2019)." 
] }, { @@ -447,29 +489,9 @@ "collapsed": false }, "outputs": [], - "source": [ - "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" - ] - }, - { - "cell_type": "markdown", - "id": "3975ac24", - "metadata": {}, - "source": [ - "### Create JSON Schema validator\n", - "\n", - "In this step, you'll create a JSON Schema validator for the NMDC Schema." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e2dbb92", - "metadata": {}, - "outputs": [], "source": [ "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", - "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema) # Is there a newer validator class available; e.g. draft 2019?\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)\n", "\n", "# Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", "# Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema\n", @@ -481,7 +503,7 @@ }, { "cell_type": "markdown", - "id": "74e2960b", + "id": "3975ac24", "metadata": {}, "source": [ "### TODO: Revoke write access to the \"origin\" MongoDB server\n", @@ -530,7 +552,7 @@ }, { "cell_type": "markdown", - "id": "c3e3c9c4", + "id": "fd4994a0", "metadata": {}, "source": [ "### Load the dumped collections into the \"transformer\" MongoDB server\n", @@ -545,7 +567,7 @@ { "cell_type": "code", "execution_count": null, - "id": "418571c5", + "id": "cf8fa1ca", "metadata": {}, "outputs": [], "source": [ @@ -567,7 +589,7 @@ }, { "cell_type": "markdown", - "id": "4c090068", + "id": "c3e3c9c4", "metadata": {}, "source": [ "### Transform the collections within the \"transformer\" MongoDB server\n", @@ -615,6 +637,9 @@ "migrator12 = M12(adapter=adapter)\n", "migrator13 = M13(adapter=adapter)\n", "migrator14 = M14(adapter=adapter)\n", + "migrator15 = M15(adapter=adapter)\n", + "migrator16 = M16(adapter=adapter)\n", + "migrator17 = M17(adapter=adapter)\n", "\n", "# Execute the Migrator's `upgrade` method to perform the migration.\n", "print(f\"[{get_timestamp()}] Calling migrator1.upgrade()\")\n", @@ -644,35 +669,49 @@ "print(f\"[{get_timestamp()}] Calling migrator13.upgrade()\")\n", "migrator13.upgrade()\n", "print(f\"[{get_timestamp()}] Calling migrator14.upgrade()\")\n", - "migrator14.upgrade()" + "migrator14.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator15.upgrade()\")\n", + "migrator15.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator16.upgrade()\")\n", + "migrator16.upgrade()\n", + "print(f\"[{get_timestamp()}] Calling migrator17.upgrade()\")\n", + "migrator17.upgrade()" ] }, { "cell_type": "markdown", - "id": "3edf77c7", + "id": "4c090068", "metadata": {}, "source": [ "### Validate the transformed documents\n", "\n", "Now that we have transformed the database, validate each document in each collection in the \"transformer\" MongoDB server.\n", "\n", - "> Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py" + "> Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py\n", + "\n", + "- TODO: Consider validating the (large) `functional_annotation_agg` collection _last_ so we find out about validation errors, if any, in _other_ (smaller) collections sooner." 
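The TODO about validating the large `functional_annotation_agg` collection last could also be handled generically by ordering collections by size, so that small collections surface validation errors first. A rough sketch, assuming the `transformer_mongo_client` handle from earlier:

```python
# Sketch: validate smaller collections first so errors surface quickly.
db = transformer_mongo_client["nmdc"]
existing_names = [n for n in COLLECTION_NAMES if n in db.list_collection_names()]
ordered_collection_names = sorted(existing_names, key=lambda n: db[n].estimated_document_count())
for name in ordered_collection_names:
    print(name, db[name].estimated_document_count())
```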
] }, { "cell_type": "code", "execution_count": null, - "id": "db6e432d", + "id": "05869340", "metadata": {}, "outputs": [], "source": [ - "# FIXME: Only validate documents in the collections that we will be restoring.\n", - "# Note: If a collection named in `COLLECTION_NAMES` doesn't exist anymore,\n", - "# I think the inner `for` loop will just have zero iterations.\n", - "\n", + "# TODO: Only validate documents in the collections that we will be restoring.\n", + "# Note: If a collection named in `COLLECTION_NAMES` doesn't exist anymore,\n", + "# the inner `for` loop will just have zero iterations.\n", "for collection_name in COLLECTION_NAMES:\n", - " print(collection_name)\n", + " \n", + " # FIXME: Temporarily skip collections I know are invalid (so I can test the others)!\n", + " if collection_name in []:\n", + " continue\n", + "\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " num_documents_in_collection = collection.count_documents({})\n", + " print(f\"Validating collection {collection_name} ({num_documents_in_collection} documents)\")\n", + "\n", " for document in collection.find():\n", " # Validate the transformed document.\n", " #\n", @@ -691,6 +730,32 @@ " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid" ] }, + { + "cell_type": "markdown", + "id": "3edf77c7", + "metadata": {}, + "source": [ + "### Dump the collections from the \"transformer\" MongoDB server\n", + "\n", + "Now that the collections have been transformed and validated, dump them **from** the \"transformer\" MongoDB server **into** a local directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db6e432d", + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the database from the \"transformer\" MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, { "cell_type": "markdown", "id": "997fcb281d9d3222", @@ -698,34 +763,19 @@ "collapsed": false }, "source": [ - "### Indicate that the migration is underway\n", + "### Create a bookkeeper\n", "\n", - "Add an entry to the migration log collection to indicate that this migration has started." + "Create a `Bookkeeper` that can be used to document migration events in the \"origin\" server." 
] }, { "cell_type": "code", "execution_count": null, - "id": "fcafd862e1becb98", - "metadata": { - "collapsed": false - }, + "id": "dbbe706d", + "metadata": {}, "outputs": [], "source": [ - "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator3, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator4, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator5, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator6, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator7, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_STARTED)" + "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" ] }, { @@ -733,9 +783,9 @@ "id": "1e0c8891", "metadata": {}, "source": [ - "### Dump the collections from the \"transformer\" MongoDB server\n", + "### Indicate that the migration is underway\n", "\n", - "Now that the collections have been transformed and validated, dump them **from** the \"transformer\" MongoDB server **into** a local directory." + "Add an entry to the migration log collection to indicate that this migration has started." 
] }, { @@ -745,13 +795,23 @@ "metadata": {}, "outputs": [], "source": [ - "# Dump the database from the \"transformer\" MongoDB server.\n", - "!{mongodump} \\\n", - " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", - " --db=\"nmdc\" \\\n", - " --gzip \\\n", - " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", - " {exclusion_options_str}" + "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator3, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator4, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator5, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator6, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator7, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator15, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator16, event=MigrationEvent.MIGRATION_STARTED)\n", + "bookkeeper.record_migration_event(migrator=migrator17, event=MigrationEvent.MIGRATION_STARTED)" ] }, { @@ -829,7 +889,10 @@ "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_COMPLETED)\n", "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_COMPLETED)" + "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator15, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator16, event=MigrationEvent.MIGRATION_COMPLETED)\n", + "bookkeeper.record_migration_event(migrator=migrator17, event=MigrationEvent.MIGRATION_COMPLETED)" ] }, { From e64bdf453fcfd37e7064ea9d487729ea4cfec498 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 12 Jun 2024 19:38:51 -0700 Subject: [PATCH 06/27] Run top-level migrator instead of partial migrators Note: This depends upon a version of nmdc-schema that hasn't been published to PyPI yet, and which doesn't have a version number yet. 
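Until that unpublished `nmdc-schema` pre-release reaches PyPI, one option is to install the package directly from the Git branch under test. The repository URL and branch below are assumptions for illustration, not something this patch series specifies:

```python
# Sketch: install the not-yet-published schema package straight from its Git branch.
# Replace the URL and branch with the actual ones.
%pip install "git+https://github.com/microbiomedata/berkeley-schema-fy24.git@main"
```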
--- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 134 +++--------------- 1 file changed, 20 insertions(+), 114 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index b7a572ea..419f7493 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -37,7 +37,7 @@ "source": [ "### 1. Determine MongoDB collections involved.\n", "\n", - "To determine this, we look at all the migrators listed in this \"meta issue\": https://github.com/microbiomedata/nmdc-schema/issues/1607. In each migrator, we make note of which collections are involved (whether for reading or for writing) and add them to the `COLLECTION_NAMES` list below.\n", + "To determine this, we look at the migrator, itself (it's currently in https://github.com/microbiomedata/berkeley-schema-fy24/blob/97220bd1fd39a81a2b446744a45c4f9402d48eb9/nmdc_schema/migrators/migrator_from_10_3_0_to_11_0_0.py). We make note of which collections are referenced by that migrator (whether for reading or for writing) and add them to the `COLLECTION_NAMES` list below.\n", "\n", "```py\n", "# TODO: Consider separating them into two lists: `COLLECTIONS_TO_DUMP` and `COLLECTIONS_TO_RESTORE`.\n", @@ -353,7 +353,7 @@ "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rc11" + "%pip install nmdc-schema==11.0.0rcXX" ] }, { @@ -378,23 +378,7 @@ "from jsonschema import Draft7Validator\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", - "from nmdc_schema.migrators.migrator_from_X_to_unknown import Migrator as M1\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR23 import Migrator as M2\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR4 import Migrator as M3\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR53 import Migrator as M4\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR21 import Migrator as M5\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR129 import Migrator as M6\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR31 import Migrator as M7\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR9 import Migrator as M8 #### Creates the `workflow_chain_set` collection.\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR19_and_PR70 import Migrator as M9\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR2_and_PR24 import Migrator as M10\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR10 import Migrator as M11\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR3 import Migrator as M12\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR176 import Migrator as M13\n", - "from nmdc_schema.migrators.migrator_from_PR176_to_PR104 import Migrator as M14\n", - "from nmdc_schema.migrators.migrator_from_X_to_PR192 import Migrator as M15\n", - "from nmdc_schema.migrators.migrator_from_PR104_to_PR195 import Migrator as M16\n", - "from nmdc_schema.migrators.migrator_from_PR195_to_unknown import Migrator as M17\n", + "from nmdc_schema.migrators.migrator_from_10_3_0_to_11_0_0 import Migrator\n", "\n", "# First-party packages:\n", "from helpers import Config\n", @@ -608,12 +592,6 @@ "metadata": {}, "outputs": [], "source": [ - "from datetime import datetime\n", - "\n", - "def get_timestamp():\n", - " return datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", - "\n", - 
"\n", "# Instantiate a MongoAdapter bound to the \"transformer\" database.\n", "adapter = MongoAdapter(\n", " database=transformer_mongo_client[\"nmdc\"],\n", @@ -622,60 +600,11 @@ " on_collection_deleted=lambda name: print(f'Deleted collection \"{name}\"'),\n", ")\n", "\n", - "# Instantiate Migrators bound to that adapter.\n", - "migrator1 = M1(adapter=adapter)\n", - "migrator2 = M2(adapter=adapter)\n", - "migrator3 = M3(adapter=adapter)\n", - "migrator4 = M4(adapter=adapter)\n", - "migrator5 = M5(adapter=adapter)\n", - "migrator6 = M6(adapter=adapter)\n", - "migrator7 = M7(adapter=adapter)\n", - "migrator8 = M8(adapter=adapter)\n", - "migrator9 = M9(adapter=adapter)\n", - "migrator10 = M10(adapter=adapter)\n", - "migrator11 = M11(adapter=adapter)\n", - "migrator12 = M12(adapter=adapter)\n", - "migrator13 = M13(adapter=adapter)\n", - "migrator14 = M14(adapter=adapter)\n", - "migrator15 = M15(adapter=adapter)\n", - "migrator16 = M16(adapter=adapter)\n", - "migrator17 = M17(adapter=adapter)\n", + "# Instantiate a Migrator bound to that adapter.\n", + "migrator = Migrator(adapter=adapter)\n", "\n", "# Execute the Migrator's `upgrade` method to perform the migration.\n", - "print(f\"[{get_timestamp()}] Calling migrator1.upgrade()\")\n", - "migrator1.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator2.upgrade()\")\n", - "migrator2.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator3.upgrade()\")\n", - "migrator3.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator4.upgrade()\")\n", - "migrator4.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator5.upgrade()\")\n", - "migrator5.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator6.upgrade()\")\n", - "migrator6.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator7.upgrade()\")\n", - "migrator7.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator8.upgrade()\")\n", - "migrator8.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator9.upgrade()\")\n", - "migrator9.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator10.upgrade()\")\n", - "migrator10.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator11.upgrade()\")\n", - "migrator11.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator12.upgrade()\")\n", - "migrator12.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator13.upgrade()\")\n", - "migrator13.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator14.upgrade()\")\n", - "migrator14.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator15.upgrade()\")\n", - "migrator15.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator16.upgrade()\")\n", - "migrator16.upgrade()\n", - "print(f\"[{get_timestamp()}] Calling migrator17.upgrade()\")\n", - "migrator17.upgrade()" + "migrator.upgrade()" ] }, { @@ -699,10 +628,19 @@ "metadata": {}, "outputs": [], "source": [ + "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `COLLECTION_NAMES`,\n", + "# it goes at the end of the list we process. 
That way, we can find out about validation errors in\n", + "# other collections without having to wait for that (large) collection to be validated before them.\n", + "ordered_collection_names = COLLECTION_NAMES.copy().sorted()\n", + "large_collection_name = \"functional_annotation_agg\"\n", + "if large_collection_name in ordered_collection_names:\n", + " ordered_collection_names = list(filter(lambda n: n != large_collection_name, ordered_collection_names))\n", + " ordered_collection_names.append(large_collection_name)\n", + "\n", "# TODO: Only validate documents in the collections that we will be restoring.\n", - "# Note: If a collection named in `COLLECTION_NAMES` doesn't exist anymore,\n", - "# the inner `for` loop will just have zero iterations.\n", - "for collection_name in COLLECTION_NAMES:\n", + "# Note: If a collection listed in `COLLECTION_NAMES` doesn't exist in the transformation\n", + "# database anymore, the inner `for` loop will just have zero iterations.\n", + "for collection_name in ordered_collection_names:\n", " \n", " # FIXME: Temporarily skip collections I know are invalid (so I can test the others)!\n", " if collection_name in []:\n", @@ -795,23 +733,7 @@ "metadata": {}, "outputs": [], "source": [ - "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator3, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator4, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator5, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator6, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator7, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator15, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator16, event=MigrationEvent.MIGRATION_STARTED)\n", - "bookkeeper.record_migration_event(migrator=migrator17, event=MigrationEvent.MIGRATION_STARTED)" + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_STARTED)" ] }, { @@ -876,23 +798,7 @@ }, "outputs": [], "source": [ - "bookkeeper.record_migration_event(migrator=migrator1, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator2, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator3, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator4, event=MigrationEvent.MIGRATION_COMPLETED)\n", - 
"bookkeeper.record_migration_event(migrator=migrator5, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator6, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator7, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator8, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator9, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator10, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator11, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator12, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator13, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator14, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator15, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator16, event=MigrationEvent.MIGRATION_COMPLETED)\n", - "bookkeeper.record_migration_event(migrator=migrator17, event=MigrationEvent.MIGRATION_COMPLETED)" + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED)" ] }, { From 82db1f406a49644ff1291092af709cf3c86497f9 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 13 Jun 2024 22:19:58 -0700 Subject: [PATCH 07/27] Use schema `11.0.0rc12` and split some notebook cells --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index 419f7493..ab017ede 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -353,7 +353,7 @@ "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rcXX" + "%pip install nmdc-schema==11.0.0rc12" ] }, { @@ -378,7 +378,7 @@ "from jsonschema import Draft7Validator\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", - "from nmdc_schema.migrators.migrator_from_10_3_0_to_11_0_0 import Migrator\n", + "from nmdc_schema.migrators.migrator_from_10_2_0_to_11_0_0 import Migrator\n", "\n", "# First-party packages:\n", "from helpers import Config\n", @@ -514,7 +514,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf8fa1ca", + "id": "831ac241", "metadata": {}, "outputs": [], "source": [ @@ -523,8 +523,16 @@ "non_agenda_collection_names = [name for name in all_collection_names if name not in COLLECTION_NAMES]\n", "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", - "print(exclusion_options_str)\n", - "\n", + "print(exclusion_options_str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8fa1ca", + "metadata": {}, + "outputs": [], + "source": [ "# Dump the not-excluded collections from the \"origin\" database.\n", "!{mongodump} \\\n", " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", @@ 
-551,15 +559,23 @@ { "cell_type": "code", "execution_count": null, - "id": "cf8fa1ca", + "id": "c4acae55", "metadata": {}, "outputs": [], "source": [ "# Build a string containing zero or more `--nsInclude=\"...\"` options, which can be included in a `mongorestore` command.\n", "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in COLLECTION_NAMES]\n", "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", - "print(inclusion_options_str)\n", - "\n", + "print(inclusion_options_str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8fa1ca", + "metadata": {}, + "outputs": [], + "source": [ "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", "!{mongorestore} \\\n", " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", @@ -631,7 +647,7 @@ "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `COLLECTION_NAMES`,\n", "# it goes at the end of the list we process. That way, we can find out about validation errors in\n", "# other collections without having to wait for that (large) collection to be validated before them.\n", - "ordered_collection_names = COLLECTION_NAMES.copy().sorted()\n", + "ordered_collection_names = sorted(COLLECTION_NAMES.copy())\n", "large_collection_name = \"functional_annotation_agg\"\n", "if large_collection_name in ordered_collection_names:\n", " ordered_collection_names = list(filter(lambda n: n != large_collection_name, ordered_collection_names))\n", @@ -721,7 +737,7 @@ "id": "1e0c8891", "metadata": {}, "source": [ - "### Indicate that the migration is underway\n", + "### Indicate — on the \"origin\" server — that the migration is underway\n", "\n", "Add an entry to the migration log collection to indicate that this migration has started." ] @@ -755,7 +771,11 @@ "\n", "Load the transformed collections into the \"origin\" MongoDB server, **replacing** the collections there that have the same names.\n", "\n", - "> Note: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually." 
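One way to act on that note is to drop the leftover collections from the "origin" database after a successful restore, since `mongorestore --drop` only drops collections that exist in the dump. The list below is illustrative only; the real list should be derived from the migrators (for example, collections that were renamed during the migration):

```python
# Sketch: drop origin collections that were renamed away during the migration
# and therefore are not present in the transformed dump.
obsolete_collection_names = [
    "omics_processing_set",                # e.g. renamed to data_generation_set
    "metagenome_annotation_activity_set",  # e.g. renamed to metagenome_annotation_set
]
origin_db = origin_mongo_client["nmdc"]
for name in obsolete_collection_names:
    if name in origin_db.list_collection_names():
        origin_db.drop_collection(name)
        print(f"Dropped {name}")
```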
+ "> Note: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually.\n", + "\n", + "- Consider using the `--preserveUUID` CLI option\n", + "\n", + "> Estimated time when running on laptop: 46 minutes" ] }, { @@ -773,7 +793,6 @@ " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", " --drop \\\n", " --stopOnError \\\n", - " --preserveUUID \\\n", " {inclusion_options_str}" ] }, From d5ae6c2e1dd80e41f580a22dd9b69c1669e08dc2 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sat, 15 Jun 2024 01:29:10 -0700 Subject: [PATCH 08/27] Write migrator logging messages to a log file --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 55 +++++++++++++++++-- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index ab017ede..12e45248 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -212,6 +212,23 @@ " \"extraction_set\",\n", "]\n", "\n", + "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/partials/migrator_from_10_2_0_to_11_0_0/migrator_from_X_to_PR104.py\n", + "from_X_to_PR104 = [\n", + " \"workflow_execution_set\",\n", + " \"metagenome_annotation_set\",\n", + " \"metagenome_assembly_set\",\n", + " \"metatranscriptome_assembly_set\",\n", + " \"metatranscriptome_annotation_set\",\n", + " \"metatranscriptome_analysis_set\",\n", + " \"mags_analysis_set\",\n", + " \"metagenome_sequencing_set\",\n", + " \"read_qc_analysis_set\",\n", + " \"read_based_taxonomy_analysis_set\",\n", + " \"metabolomics_analysis_set\",\n", + " \"metaproteomics_analysis_set\",\n", + " \"nom_analysis_set\",\n", + "]\n", + "\n", "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_PR104_to_PR195.py\n", "from_PR104_to_PR195 = [\n", " \"collecting_biosamples_from_site_set\",\n", @@ -269,6 +286,7 @@ " *from_X_to_PR176,\n", " *from_PR176_to_PR104,\n", " *from_X_to_PR192,\n", + " *from_X_to_PR104,\n", " *from_PR104_to_PR195,\n", " *from_PR195_to_unknown,\n", "]\n", @@ -314,7 +332,9 @@ " $ cp .mongo.yaml.example .mongo.origin.yaml\n", " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", " ```\n", - " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database." + " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database.\n", + "\n", + "- TODO: Be more specific about the Mongo privileges necessary to perform a `mongodump` and a `mongorestore` that may involve creating/deleting collections." 
] }, { @@ -353,7 +373,7 @@ "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rc12" + "%pip install nmdc-schema==11.0.0rc13" ] }, { @@ -376,7 +396,7 @@ "# Third-party packages:\n", "import pymongo\n", "from jsonschema import Draft7Validator\n", - "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict, SchemaVariantIdentifier\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", "from nmdc_schema.migrators.migrator_from_10_2_0_to_11_0_0 import Migrator\n", "\n", @@ -474,7 +494,7 @@ }, "outputs": [], "source": [ - "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n", + "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict(variant=SchemaVariantIdentifier.nmdc_materialized_patterns)\n", "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)\n", "\n", "# Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", @@ -601,6 +621,29 @@ "> Reminder: The \"origin\" database is **not** affected by this step." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "df8ee3da", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "# Setup a logger that writes to a file.\n", + "# TODO: Move this logger stuff to `helpers.py`.`\n", + "LOG_FILE_PATH = \"./.tmp.log\"\n", + "logger = logging.getLogger(name=\"migrator_logger\")\n", + "logger.setLevel(logging.DEBUG)\n", + "file_handler = logging.FileHandler(LOG_FILE_PATH)\n", + "formatter = logging.Formatter(fmt=\"[[%(asctime)s][%(name)s][%(levelname)s]] %(message)s\",\n", + " datefmt=\"%Y-%m-%d %H:%M:%S.%f\")\n", + "file_handler.setFormatter(formatter)\n", + "if logger.hasHandlers():\n", + " logger.handlers.clear() # avoid duplicate log entries\n", + "logger.addHandler(file_handler)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -617,7 +660,7 @@ ")\n", "\n", "# Instantiate a Migrator bound to that adapter.\n", - "migrator = Migrator(adapter=adapter)\n", + "migrator = Migrator(adapter=adapter, logger=logger)\n", "\n", "# Execute the Migrator's `upgrade` method to perform the migration.\n", "migrator.upgrade()" @@ -775,7 +818,7 @@ "\n", "- Consider using the `--preserveUUID` CLI option\n", "\n", - "> Estimated time when running on laptop: 46 minutes" + "> Estimated time when running on laptop: 15 minutes" ] }, { From e7f83e9b480741b255a3ad97c8962d105a310374 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sat, 15 Jun 2024 01:32:19 -0700 Subject: [PATCH 09/27] Give log file a name that is ignored by Git --- .../metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index 12e45248..9f141255 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -632,7 +632,7 @@ "\n", "# Setup a logger that writes to a file.\n", "# TODO: Move this logger stuff to `helpers.py`.`\n", - "LOG_FILE_PATH = \"./.tmp.log\"\n", + "LOG_FILE_PATH = \"./tmp.log\"\n", "logger = logging.getLogger(name=\"migrator_logger\")\n", "logger.setLevel(logging.DEBUG)\n", "file_handler = logging.FileHandler(LOG_FILE_PATH)\n", From 87240650651171eed205517bb8027f9d74a14f82 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 17 Jun 2024 21:08:54 -0700 Subject: [PATCH 10/27] 
Use schema `11.0.0rc16`, update collection names, and reformat log output --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index 9f141255..0420aea1 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -220,7 +220,7 @@ " \"metatranscriptome_assembly_set\",\n", " \"metatranscriptome_annotation_set\",\n", " \"metatranscriptome_analysis_set\",\n", - " \"mags_analysis_set\",\n", + " \"mags_set\",\n", " \"metagenome_sequencing_set\",\n", " \"read_qc_analysis_set\",\n", " \"read_based_taxonomy_analysis_set\",\n", @@ -254,7 +254,7 @@ " \"metatranscriptome_assembly_set\",\n", " \"metatranscriptome_annotation_set\",\n", " \"metatranscriptome_analysis_set\",\n", - " \"mags_analysis_set\",\n", + " \"mags_set\",\n", " \"metagenome_sequencing_set\",\n", " \"read_qc_analysis_set\",\n", " \"read_based_taxonomy_analysis_set\",\n", @@ -373,7 +373,7 @@ "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rc13" + "%pip install nmdc-schema==11.0.0rc16" ] }, { @@ -636,8 +636,8 @@ "logger = logging.getLogger(name=\"migrator_logger\")\n", "logger.setLevel(logging.DEBUG)\n", "file_handler = logging.FileHandler(LOG_FILE_PATH)\n", - "formatter = logging.Formatter(fmt=\"[[%(asctime)s][%(name)s][%(levelname)s]] %(message)s\",\n", - " datefmt=\"%Y-%m-%d %H:%M:%S.%f\")\n", + "formatter = logging.Formatter(fmt=\"%(asctime)s\\t%(name)s\\t%(levelname)s\\t%(message)s\",\n", + " datefmt=\"%Y-%m-%d %H:%M:%S\")\n", "file_handler.setFormatter(formatter)\n", "if logger.hasHandlers():\n", " logger.handlers.clear() # avoid duplicate log entries\n", @@ -702,7 +702,7 @@ "for collection_name in ordered_collection_names:\n", " \n", " # FIXME: Temporarily skip collections I know are invalid (so I can test the others)!\n", - " if collection_name in []:\n", + " if collection_name in [\"data_object_set\", \"workflow_execution_set\", \"functional_annotation_agg\"]:\n", " continue\n", "\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", @@ -818,7 +818,7 @@ "\n", "- Consider using the `--preserveUUID` CLI option\n", "\n", - "> Estimated time when running on laptop: 15 minutes" + "> Estimated time when running on laptop: 17 minutes" ] }, { From 836f07d6a96b9af005d0c87d25f349886d5a8d39 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 18 Jun 2024 10:09:24 -0700 Subject: [PATCH 11/27] Dump/restore all collections from/to origin (temporary change) --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index 0420aea1..71c3787a 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -19,7 +19,9 @@ "\n", "This notebook is special. Unlike all previous notebooks, each of which only used a single migrator; this notebook will be using multiple migrators.\n", "\n", - "This notebook will be used to migrate the database from `v10.2.0` (i.e. the final version of the pre-Berkeley schema) to `v11.0.0` (i.e. the initial version of the Berkeley schema)." 
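Since successive patches in this series bump the pinned pre-release (`rc11`, `rc12`, `rc13`, `rc16`), a small runtime check can confirm which release the kernel actually has installed. A sketch using only the standard library:

```python
# Sketch: confirm which nmdc-schema release is installed in the running kernel.
from importlib.metadata import version

print("nmdc-schema:", version("nmdc-schema"))
```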
+ "This notebook will be used to migrate the database from `v10.2.0` to `v11.0.0` (i.e. the initial version of the Berkeley schema).\n", + "\n", + "- TODO: In reality, it may be used to migrate the database from `v10.5.4`, or any other `nmdc-schema` release that happens between now and when we switch over to the Berkeley schema." ] }, { @@ -37,10 +39,10 @@ "source": [ "### 1. Determine MongoDB collections involved.\n", "\n", - "To determine this, we look at the migrator, itself (it's currently in https://github.com/microbiomedata/berkeley-schema-fy24/blob/97220bd1fd39a81a2b446744a45c4f9402d48eb9/nmdc_schema/migrators/migrator_from_10_3_0_to_11_0_0.py). We make note of which collections are referenced by that migrator (whether for reading or for writing) and add them to the `COLLECTION_NAMES` list below.\n", + "To determine this, we look at the migrator, itself (it's currently in https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_10_2_0_to_11_0_0.py). We make note of which collections are referenced by that migrator (whether for reading or for writing) and add them to the `COLLECTION_NAMES` list below.\n", "\n", "```py\n", - "# TODO: Consider separating them into two lists: `COLLECTIONS_TO_DUMP` and `COLLECTIONS_TO_RESTORE`.\n", + "# TODO: Consider separating them into two lists: `COLLECTIONS_TO_DUMP` and `COLLECTIONS_TO_RESTORE`. Or, make a list of collections that I will manually delete from the origin server after running this notebook.\n", "```" ] }, @@ -553,6 +555,9 @@ "metadata": {}, "outputs": [], "source": [ + "# FIXME: Temporarily excluding nothing so that everything gets dumped!\n", + "exclusion_options_str = \"\"\n", + "\n", "# Dump the not-excluded collections from the \"origin\" database.\n", "!{mongodump} \\\n", " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", @@ -573,7 +578,9 @@ "\n", "> Since it's possible that the dump included extra collections (due to someone having created a collection between the time you generated the `--excludeCollection` CLI options and the time you ran `mongodump` above), we will use the `--nsInclude` CLI option to indicate which specific collections—from the dump—we want to load into the \"transformer\" database.\n", "\n", - "> Note: This step typically takes 3 minutes (on a MacBook Pro M1, when running MongoDB in a Docker container)." + "> Note: This step typically takes 3 minutes (on a MacBook Pro M1, when running MongoDB in a Docker container).\n", + "\n", + "- TODO: Are \"views\" included in `mongodump` dumps? If so, how does `mongorestore` handle them—does it restore them as \"views\" or as normal \"collections\"?" 
] }, { @@ -596,6 +603,9 @@ "metadata": {}, "outputs": [], "source": [ + "# FIXME: Temporarily include nothing explicitly so that everything gets restored!\n", + "inclusion_options_str = \"\"\n", + "\n", "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", "!{mongorestore} \\\n", " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", From 88fe6e0573097c92ee90adcd2defcedf7a02a2f9 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 21 Jun 2024 23:48:08 -0700 Subject: [PATCH 12/27] Configure Git to ignore Mongo configuration file variants --- demo/metadata_migration/notebooks/.gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/demo/metadata_migration/notebooks/.gitignore b/demo/metadata_migration/notebooks/.gitignore index 000100c5..255005c5 100644 --- a/demo/metadata_migration/notebooks/.gitignore +++ b/demo/metadata_migration/notebooks/.gitignore @@ -1,5 +1,4 @@ /.notebook.env -/.mongo.origin.yaml -/.mongo.transformer.yaml +/.mongo.*.yaml /mongodump.*.out /tmp.* \ No newline at end of file From f9de28be6390865537933bb8125dc675b95218c8 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 25 Jun 2024 15:06:48 -0700 Subject: [PATCH 13/27] Update notebook to delete existing dumps and (temporarily) dump _all_ collections --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index 71c3787a..ed58b0fd 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -7,7 +7,9 @@ "collapsed": true }, "source": [ - "# Migrate MongoDB database from `nmdc-schema` `v10.2.0` to `v11.0.0`" + "# Migrate MongoDB database from `nmdc-schema` `v10.2.0` to `v11.0.0`\n", + "\n", + "- TODO: Update the initial schema version in the heading, elsewhere in the notebook, and in the filename, to `v10.5.6`." ] }, { @@ -21,7 +23,7 @@ "\n", "This notebook will be used to migrate the database from `v10.2.0` to `v11.0.0` (i.e. the initial version of the Berkeley schema).\n", "\n", - "- TODO: In reality, it may be used to migrate the database from `v10.5.4`, or any other `nmdc-schema` release that happens between now and when we switch over to the Berkeley schema." + "- TODO: In reality, it may be used to migrate the database from `v10.5.6` (released June 25, 2024), or any other `nmdc-schema` release that happens between now and when we switch over to the Berkeley schema." ] }, { @@ -519,6 +521,27 @@ "Note: The migrator Mongo user may need additional permissions in order to manipulate Mongo user roles to the extent necessary to accomplish this step." ] }, + { + "cell_type": "markdown", + "id": "e60c8935", + "metadata": {}, + "source": [ + "### Delete obsolete dumps\n", + "\n", + "Delete any existing dumps so that the dumps you generate below will not be mixed in with any unrelated ones." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df48c2ce", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {cfg.origin_dump_folder_path}\n", + "!rm -rf {cfg.transformer_dump_folder_path}" + ] + }, { "cell_type": "markdown", "id": "fd4994a0", @@ -628,7 +651,9 @@ "\n", "> Reminder: The database transformation functions are defined in the `nmdc-schema` Python package installed earlier.\n", "\n", - "> Reminder: The \"origin\" database is **not** affected by this step." + "> Reminder: The \"origin\" database is **not** affected by this step.\n", + "\n", + "- TODO: Consider deleting the existing log or appending a timestamp to the log filename." ] }, { @@ -826,8 +851,6 @@ "\n", "> Note: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually.\n", "\n", - "- Consider using the `--preserveUUID` CLI option\n", - "\n", "> Estimated time when running on laptop: 17 minutes" ] }, @@ -844,7 +867,7 @@ " --gzip \\\n", " --verbose \\\n", " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", - " --drop \\\n", + " --drop --preserveUUID \\\n", " --stopOnError \\\n", " {inclusion_options_str}" ] From 782d2b2558aad79673a0f12da52b320be11a08a1 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 25 Jun 2024 23:56:37 -0700 Subject: [PATCH 14/27] Update migration notebook to revoke/restore database access --- .../notebooks/.notebook.env.example | 22 +++++- demo/metadata_migration/notebooks/helpers.py | 33 ++++++++ .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 77 +++++++++++++++++-- .../restore-privileges.mongo.js | 75 ++++++++++++++++++ .../revoke-privileges.mongo.js | 19 +++++ 5 files changed, 218 insertions(+), 8 deletions(-) create mode 100644 demo/metadata_migration/notebooks/mongosh-scripts/restore-privileges.mongo.js create mode 100644 demo/metadata_migration/notebooks/mongosh-scripts/revoke-privileges.mongo.js diff --git a/demo/metadata_migration/notebooks/.notebook.env.example b/demo/metadata_migration/notebooks/.notebook.env.example index 25d3e266..6bed63ec 100644 --- a/demo/metadata_migration/notebooks/.notebook.env.example +++ b/demo/metadata_migration/notebooks/.notebook.env.example @@ -6,6 +6,24 @@ PATH_TO_TRANSFORMER_MONGO_CONFIG_FILE = "./.mongo.transformer.yaml" PATH_TO_ORIGIN_MONGO_DUMP_FOLDER = "./mongodump.origin.out" PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER = "./mongodump.transformer.out" -# These are absolute paths to the `mongodump` and `mongorestore` programs. -PATH_TO_MONGODUMP_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongodb-database-tools-macos-arm64-100.7.4/bin/mongodump" +# These are absolute paths to the `mongodump`, `mongorestore`, and `mongosh` programs. +PATH_TO_MONGODUMP_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongodb-database-tools-macos-arm64-100.7.4/bin/mongodump" PATH_TO_MONGORESTORE_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongodb-database-tools-macos-arm64-100.7.4/bin/mongorestore" +PATH_TO_MONGOSH_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongosh-1.10.6-darwin-x64/bin/mongosh" + +# TODO: Now that the notebooks require both (a) Mongo config files — see above, and +# (b) discrete connection parameters — see below, consider using the parameters +# below to either (I) dynamically create a Mongo config file, or (II) use them +# directly as CLI options to `mongodump` and `mongorestore`. 
+ +# Connection parameters for the Origin Mongo server (typically a remote serve). +ORIGIN_MONGO_HOST="__REPLACE_ME__" +ORIGIN_MONGO_PORT="__REPLACE_ME__" +ORIGIN_MONGO_USERNAME="__REPLACE_ME__" +ORIGIN_MONGO_PASSWORD="__REPLACE_ME__" + +# Connection parameters for the Transformer Mongo server (typically a local server). +TRANSFORMER_MONGO_HOST="__REPLACE_ME__" +TRANSFORMER_MONGO_PORT="__REPLACE_ME__" +TRANSFORMER_MONGO_USERNAME="__REPLACE_ME__" +TRANSFORMER_MONGO_PASSWORD="__REPLACE_ME__" diff --git a/demo/metadata_migration/notebooks/helpers.py b/demo/metadata_migration/notebooks/helpers.py index d513af9f..bfcdd9cf 100644 --- a/demo/metadata_migration/notebooks/helpers.py +++ b/demo/metadata_migration/notebooks/helpers.py @@ -54,12 +54,25 @@ def parse_and_validate_notebook_config_file( # Validate the binary paths. mongodump_path = notebook_config["PATH_TO_MONGODUMP_BINARY"] mongorestore_path = notebook_config["PATH_TO_MONGORESTORE_BINARY"] + mongosh_path = notebook_config["PATH_TO_MONGOSH_BINARY"] if not Path(mongodump_path).is_file(): raise FileNotFoundError(f"mongodump binary not found at: {mongodump_path}") if not Path(mongorestore_path).is_file(): raise FileNotFoundError( f"mongorestore binary not found at: {mongorestore_path}" ) + if not Path(mongosh_path).is_file(): + raise FileNotFoundError(f"mongosh binary not found at: {mongosh_path}") + + origin_mongo_host = notebook_config["ORIGIN_MONGO_HOST"] + origin_mongo_port = notebook_config["ORIGIN_MONGO_PORT"] + origin_mongo_username = notebook_config["ORIGIN_MONGO_USERNAME"] + origin_mongo_password = notebook_config["ORIGIN_MONGO_PASSWORD"] + + transformer_mongo_host = notebook_config["TRANSFORMER_MONGO_HOST"] + transformer_mongo_port = notebook_config["TRANSFORMER_MONGO_PORT"] + transformer_mongo_username = notebook_config["TRANSFORMER_MONGO_USERNAME"] + transformer_mongo_password = notebook_config["TRANSFORMER_MONGO_PASSWORD"] return dict( origin_mongo_config_file_path=origin_mongo_config_file_path, @@ -68,6 +81,15 @@ def parse_and_validate_notebook_config_file( transformer_dump_folder_path=transformer_dump_folder_path, mongodump_path=mongodump_path, mongorestore_path=mongorestore_path, + mongosh_path=mongosh_path, + origin_mongo_host=origin_mongo_host, + origin_mongo_port=origin_mongo_port, + origin_mongo_username=origin_mongo_username, + origin_mongo_password=origin_mongo_password, + transformer_mongo_host=transformer_mongo_host, + transformer_mongo_port=transformer_mongo_port, + transformer_mongo_username=transformer_mongo_username, + transformer_mongo_password=transformer_mongo_password, ) def parse_and_validate_mongo_config_file( @@ -93,6 +115,7 @@ def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None: ) self.mongodump_path = notebook_config["mongodump_path"] self.mongorestore_path = notebook_config["mongorestore_path"] + self.mongosh_path = notebook_config["mongosh_path"] self.origin_dump_folder_path = notebook_config["origin_dump_folder_path"] self.transformer_dump_folder_path = notebook_config[ "transformer_dump_folder_path" @@ -113,3 +136,13 @@ def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None: ) self.origin_mongo_server_uri = origin_mongo_server_config["uri"] self.transformer_mongo_server_uri = transformer_mongo_server_config["uri"] + + # Parse the Mongo connection parameters. 
+ self.origin_mongo_host = notebook_config["origin_mongo_host"] + self.origin_mongo_port = notebook_config["origin_mongo_port"] + self.origin_mongo_username = notebook_config["origin_mongo_username"] + self.origin_mongo_password = notebook_config["origin_mongo_password"] + self.transformer_mongo_host = notebook_config["transformer_mongo_host"] + self.transformer_mongo_port = notebook_config["transformer_mongo_port"] + self.transformer_mongo_username = notebook_config["transformer_mongo_username"] + self.transformer_mongo_password = notebook_config["transformer_mongo_password"] \ No newline at end of file diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index ed58b0fd..bf18d680 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -397,6 +397,9 @@ "metadata": {}, "outputs": [], "source": [ + "# Standard library packages:\n", + "import subprocess\n", + "\n", "# Third-party packages:\n", "import pymongo\n", "from jsonschema import Draft7Validator\n", @@ -431,10 +434,12 @@ "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", "mongodump = cfg.mongodump_path\n", "mongorestore = cfg.mongorestore_path\n", + "mongosh = cfg.mongosh_path\n", "\n", "# Perform a sanity test of the application paths.\n", "!{mongodump} --version\n", - "!{mongorestore} --version" + "!{mongorestore} --version\n", + "!{mongosh} --version" ] }, { @@ -514,11 +519,49 @@ "id": "3975ac24", "metadata": {}, "source": [ - "### TODO: Revoke write access to the \"origin\" MongoDB server\n", + "### Revoke access from the \"origin\" MongoDB server\n", + "\n", + "We revoke \"write\" access so people don't make changes to the original data while the migration is happening, given that the migration ends with an overwriting of the original data.\n", + "\n", + "We also revoke \"read\" access. The revocation of \"read\" access is technically optional, but (a) the JavaScript script will be easier for me to maintain if it revokes everything and (b) this prevents people from reading data during the restore step, during which the database may not be self-consistent.\n", "\n", - "This is so people don't make changes to the original data while the migration is happening, given that the migration ends with an overwriting of the original data.\n", + "- TODO: Consider allowing \"read\" access until _just before_ the restore step.\n", "\n", - "Note: The migrator Mongo user may need additional permissions in order to manipulate Mongo user roles to the extent necessary to accomplish this step." 
+ "References:\n", + "\n", + "- https://docs.python.org/3/library/subprocess.html\n", + "- https://www.mongodb.com/docs/mongodb-shell/reference/options/\n", + "- https://www.mongodb.com/docs/mongodb-shell/write-scripts/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f761caad", + "metadata": {}, + "outputs": [], + "source": [ + "# Note: I run this command via Python's `subprocess` module instead of via an IPython magic `!` command\n", + "# because one of the CLI options contains the Mongo password (since `mongosh` does not support the\n", + "# use of config files located anywhere except in the user's home directory) and my gut tells me\n", + "# this approach makes it less likely that the password appear in some shell history compared to\n", + "# if the command were run via a `!` command (since, to me, the latter more closely resembles\n", + "# regular shell usage).\n", + "#\n", + "# TODO: Revisit this; and consider switching all the other `!` commands to use `subprocess`\n", + "# so that this notebook is closer to becoming a regular Python script.\n", + "#\n", + "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} \\\n", + " --host='{cfg.origin_mongo_host}' \\\n", + " --port='{cfg.origin_mongo_port}' \\\n", + " --username='{cfg.origin_mongo_username}' \\\n", + " --password='{cfg.origin_mongo_password}' \\\n", + " --quiet \\\n", + " --file='mongosh-scripts/revoke-privileges.mongo.js'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" ] }, { @@ -837,7 +880,9 @@ "source": [ "### TODO: Drop the original collections from the \"origin\" MongoDB server\n", "\n", - "This is necessary for situations where collections were renamed or deleted. The `--drop` option of `mongorestore` only drops collections that exist in the dump. We may need `mongosh` for this." + "This is necessary for situations where collections were renamed or deleted. The `--drop` option of `mongorestore` only drops collections that exist in the dump. We may need `mongosh` for this.\n", + "\n", + "- TODO: Now that the notebook does depend upon `mongosh`, revisit filling in this step." ] }, { @@ -901,10 +946,30 @@ "id": "04c856a8", "metadata": {}, "source": [ - "### TODO: Reinstate write access to the MongoDB server\n", + "### Restore access to the MongoDB server\n", "\n", "This effectively un-does the access revocation that we did earlier." 
] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aab3c7e", + "metadata": {}, + "outputs": [], + "source": [ + "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} \\\n", + " --host='{cfg.origin_mongo_host}' \\\n", + " --port='{cfg.origin_mongo_port}' \\\n", + " --username='{cfg.origin_mongo_username}' \\\n", + " --password='{cfg.origin_mongo_password}' \\\n", + " --quiet \\\n", + " --file='mongosh-scripts/restore-privileges.mongo.js'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ] } ], "metadata": { diff --git a/demo/metadata_migration/notebooks/mongosh-scripts/restore-privileges.mongo.js b/demo/metadata_migration/notebooks/mongosh-scripts/restore-privileges.mongo.js new file mode 100644 index 00000000..b9f3a9f9 --- /dev/null +++ b/demo/metadata_migration/notebooks/mongosh-scripts/restore-privileges.mongo.js @@ -0,0 +1,75 @@ +/** + * This mongosh script restores all standard NMDC user-defined Mongo roles + * (except for the "nmdc_migrator" role) to their standard states. + * + * Note: This script contains excerpts from the authoritative user-defined role reference script at: + * https://github.com/microbiomedata/infra-admin/blob/main/mongodb/roles/createRoles.mongo.js + * You can compare this file to that one, using: https://www.diffchecker.com/text-compare/ + * + * Note: I select the database via `db.getSiblingDB()` since the `use` helper isn't available here. + * Reference: https://www.mongodb.com/docs/manual/reference/method/db.getSiblingDB/ + */ + +const db = db.getSiblingDB("admin"); + +db.updateRole("nmdc_runtime", { + privileges: [], + roles: [ + { db: "admin", role: "readWriteAnyDatabase" }, + { db: "admin", role: "dbAdminAnyDatabase" }, + ], +}); + +db.updateRole("nmdc_scheduler", { + privileges: [ + { resource: { db: "nmdc", collection: "jobs" }, actions: ["find", "insert", "update", "remove"] } + ], + roles: [ + { db: "nmdc", role: "read" }, + ], +}); + +db.updateRole("nmdc_aggregator", { + privileges: [ + { resource: { db: "nmdc", collection: "metap_gene_function_aggregation" }, actions: ["find", "insert", "update", "remove"] }, + { resource: { db: "nmdc", collection: "functional_annotation_agg" }, actions: ["find", "insert", "update", "remove"] }, + ], + roles: [ + { db: "nmdc", role: "read" }, + ], +}); + +db.updateRole("nmdc_reader", { + privileges: [ + { resource: { db: "", collection: "" }, actions: ["changeOwnPassword"] }, + ], + roles: [ + { db: "nmdc", role: "read" }, + { db: "nmdc_updated", role: "read" }, + { db: "nmdc_deleted", role: "read" }, + { db: "nmdc_changesheet_submission_results", role: "read" }, + ], +}); + +db.updateRole("nmdc_editor", { + privileges: [ + { resource: { db: "", collection: "" }, actions: ["changeOwnPassword"] }, + ], + roles: [ + { db: "nmdc", role: "readWrite" }, + { db: "nmdc_updated", role: "readWrite" }, + { db: "nmdc_deleted", role: "readWrite" }, + { db: "nmdc_changesheet_submission_results", role: "readWrite" }, + ], +}); + +db.updateRole("all_dumper", { + privileges: [ + { resource: { db: "config", collection: "system.preimages" }, actions: ["find"] }, + ], + roles: [ + { db: "admin", role: "backup" }, + ], +}); + +print("✅ Access restored."); \ No newline at end of file diff --git a/demo/metadata_migration/notebooks/mongosh-scripts/revoke-privileges.mongo.js b/demo/metadata_migration/notebooks/mongosh-scripts/revoke-privileges.mongo.js new file mode 100644 index 00000000..588ec21e --- /dev/null +++ 
b/demo/metadata_migration/notebooks/mongosh-scripts/revoke-privileges.mongo.js @@ -0,0 +1,19 @@ +/** + * This mongosh script revokes all privileges from all standard NMDC user-defined Mongo roles + * (except for the "nmdc_migrator" role). + * + * Note: I select the database via `db.getSiblingDB()` since the `use` helper isn't available here. + * Reference: https://www.mongodb.com/docs/manual/reference/method/db.getSiblingDB/ + */ + + +const db = db.getSiblingDB("admin"); + +db.updateRole("nmdc_reader", { privileges: [], roles: [] }); +db.updateRole("nmdc_editor", { privileges: [], roles: [] }); +db.updateRole("nmdc_runtime", { privileges: [], roles: [] }); +db.updateRole("nmdc_aggregator", { privileges: [], roles: [] }); +db.updateRole("nmdc_scheduler", { privileges: [], roles: [] }); +db.updateRole("all_dumper", { privileges: [], roles: [] }); + +print("✋ Access revoked."); \ No newline at end of file From 08da942206a58d2425cb8cb0113a34e165009321 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 11 Jul 2024 00:35:58 -0700 Subject: [PATCH 15/27] Remove temporary code used to skip validating invalid data --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 5 ----- 1 file changed, 5 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index bf18d680..adab978e 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -778,11 +778,6 @@ "# Note: If a collection listed in `COLLECTION_NAMES` doesn't exist in the transformation\n", "# database anymore, the inner `for` loop will just have zero iterations.\n", "for collection_name in ordered_collection_names:\n", - " \n", - " # FIXME: Temporarily skip collections I know are invalid (so I can test the others)!\n", - " if collection_name in [\"data_object_set\", \"workflow_execution_set\", \"functional_annotation_agg\"]:\n", - " continue\n", - "\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", " num_documents_in_collection = collection.count_documents({})\n", " print(f\"Validating collection {collection_name} ({num_documents_in_collection} documents)\")\n", From e9e4b5496f5d187333a5474896fcc7abfce11464 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 11 Jul 2024 23:08:58 -0700 Subject: [PATCH 16/27] Remove collection name lists and inclusion/exclusion options --- .../notebooks/migrate_10_2_0_to_11_0_0.ipynb | 455 +++--------------- 1 file changed, 68 insertions(+), 387 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb index adab978e..19887ce5 100644 --- a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb @@ -7,9 +7,7 @@ "collapsed": true }, "source": [ - "# Migrate MongoDB database from `nmdc-schema` `v10.2.0` to `v11.0.0`\n", - "\n", - "- TODO: Update the initial schema version in the heading, elsewhere in the notebook, and in the filename, to `v10.5.6`." + "# Migrate MongoDB database from `nmdc-schema` `v10.5.6` to `v11.0.0`" ] }, { @@ -19,11 +17,9 @@ "source": [ "## Introduction\n", "\n", - "This notebook is special. Unlike all previous notebooks, each of which only used a single migrator; this notebook will be using multiple migrators.\n", - "\n", - "This notebook will be used to migrate the database from `v10.2.0` to `v11.0.0` (i.e. 
the initial version of the Berkeley schema).\n", + "This notebook will be used to migrate the database from `nmdc-schema` `v10.5.6` ([released](https://github.com/microbiomedata/nmdc-schema/releases/tag/v10.5.6) June 25, 2024) to `v11.0.0` (i.e. the initial version of the Berkeley schema).\n", "\n", - "- TODO: In reality, it may be used to migrate the database from `v10.5.6` (released June 25, 2024), or any other `nmdc-schema` release that happens between now and when we switch over to the Berkeley schema." + "Unlike previous migrators, this one does not pick and choose which collections it will dump. There are two reasons for this: (1) migrators no longer have a dedicated `self.agenda` dictionary that indicates all the collections involved in the migration; and (2) this migration is the first one that involves creating, renaming, and dropping any collections; none of which were things that the old `self.agenda`-based system were designed to handle. So, instead of picking and choosing collections, this migrator **dumps them all.**" ] }, { @@ -36,309 +32,68 @@ }, { "cell_type": "markdown", - "id": "37d358ba", + "id": "17f351e8", "metadata": {}, "source": [ - "### 1. Determine MongoDB collections involved.\n", + "### 1. Coordinate with stakeholders.\n", "\n", - "To determine this, we look at the migrator, itself (it's currently in https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_10_2_0_to_11_0_0.py). We make note of which collections are referenced by that migrator (whether for reading or for writing) and add them to the `COLLECTION_NAMES` list below.\n", + "We will be enacting full Runtime and Database downtime for this migration. Ensure stakeholders are aware of that." + ] + }, + { + "cell_type": "markdown", + "id": "233a35c3", + "metadata": {}, + "source": [ + "### 2. Set up environment.\n", "\n", - "```py\n", - "# TODO: Consider separating them into two lists: `COLLECTIONS_TO_DUMP` and `COLLECTIONS_TO_RESTORE`. Or, make a list of collections that I will manually delete from the origin server after running this notebook.\n", - "```" + "Here, you'll prepare an environment for running this notebook.\n", + "\n", + "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database named `nmdc`).\n", + " 1. 
You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command (this MongoDB server will be accessible without a username or password).\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "09966b0d", + "id": "8aee55e3", "metadata": {}, "outputs": [], "source": [ - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_unknown.py\n", - "from_X_to_unknown = [\n", - " \"omics_processing_set\",\n", - " \"pooling_set\",\n", - " \"library_preparation_set\",\n", - " \"extraction_set\"\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR23.py\n", - "from_10_2_0_to_PR23 = [\n", - " \"metagenome_assembly_set\",\n", - " \"metagenome_annotation_activity_set\",\n", - " \"metatranscriptome_activity_set\",\n", - " \"mags_activity_set\",\n", - " \"metagenome_sequencing_activity_set\",\n", - " \"read_qc_analysis_activity_set\",\n", - " \"read_based_taxonomy_analysis_activity_set\",\n", - " \"metabolomics_analysis_activity_set\",\n", - " \"metaproteomics_analysis_activity_set\",\n", - " \"nom_analysis_activity_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR4.py\n", - "from_PR23_to_PR4 = [\n", - " \"omics_processing_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR53.py\n", - "from_PR4_to_PR53 = [\n", - " \"omics_processing_set\",\n", - " \"biosample_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR21.py\n", - "from_PR53_to_PR21 = [\n", - " \"study_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR129.py\n", - "from_PR21_to_PR129 = [\n", - " \"metabolomics_analysis_activity_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR31.py\n", - "from_PR129_to_PR31 = [\n", - " \"mags_activity_set\",\n", - " \"metabolomics_analysis_activity_set\",\n", - " \"metagenome_annotation_activity_set\",\n", - " \"metagenome_assembly_set\",\n", - " \"metagenome_sequencing_activity_set\",\n", - " \"metatranscriptome_activity_set\",\n", - " \"nom_analysis_activity_set\",\n", - " \"omics_processing_set\",\n", - " \"read_based_taxonomy_analysis_activity_set\",\n", - " \"read_qc_analysis_activity_set\"\n", - " \"metaproteomics_analysis_activity_set\"\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR9.py\n", - "from_PR31_to_PR9 = [\n", - " \"metagenome_sequencing_activity_set\",\n", - " \"read_qc_analysis_activity_set\",\n", - " \"metagenome_assembly_set\",\n", - " \"read_based_taxonomy_analysis_activity_set\",\n", - " \"metagenome_annotation_activity_set\",\n", - " \"mags_activity_set\",\n", - " \"metabolomics_analysis_activity_set\",\n", - " \"nom_analysis_activity_set\",\n", - " \"metatranscriptome_activity_set\",\n", - " \"metaproteomics_analysis_activity_set\",\n", - "\n", - " \"omics_processing_set\",\n", - " \"workflow_chain_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR19_and_PR70.py\n", - 
"from_PR9_to_PR19_PR70 = [\n", - " \"instrument_set\",\n", - " \"omics_processing_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR2_and_PR24.py\n", - "from_PR19_PR70_to_PR2_PR24 = [\n", - " \"omics_processing_set\", \n", - " \"data_generation_set\",\n", - "\n", - " \"mags_activity_set\", \n", - " \"mags_set\",\n", - " \n", - " \"metabolomics_analysis_activity_set\", \n", - " \"metabolomics_analysis_set\",\n", - " \n", - " \"metagenome_annotation_activity_set\", \n", - " \"metagenome_annotation_set\",\n", - " \n", - " \"metagenome_sequencing_activity_set\", \n", - " \"metagenome_sequencing_set\",\n", - " \n", - " \"metaproteomics_analysis_activity_set\", \n", - " \"metaproteomics_analysis_set\",\n", - " \n", - " \"metatranscriptome_activity_set\",\n", - " \"metatranscriptome_analysis_set\",\n", - " \n", - " \"nom_analysis_activity_set\",\n", - " \"nom_analysis_set\",\n", - " \n", - " \"read_based_taxonomy_analysis_activity_set\",\n", - " \"read_based_taxonomy_analysis_set\",\n", - " \n", - " \"read_qc_analysis_activity_set\",\n", - " \"read_qc_analysis_set\",\n", - " \n", - " \"activity_set\",\n", - " \"workflow_execution_set\" \n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR10.py\n", - "from_PR2_PR24_to_PR10 = [\n", - " \"biosample_set\",\n", - " \"data_object_set\",\n", - " \"functional_annotation_agg\",\n", - " \"study_set\",\n", - " \"extraction_set\",\n", - " \"field_research_site_set\",\n", - " \"library_preparation_set\",\n", - " \"mags_set\",\n", - " \"metabolomics_analysis_set\",\n", - " \"metagenome_annotation_set\",\n", - " \"metagenome_assembly_set\",\n", - " \"metagenome_sequencing_set\",\n", - " \"metaproteomics_analysis_set\",\n", - " \"metatranscriptome_analysis_set\",\n", - " \"nom_analysis_set\",\n", - " \"data_generation_set\",\n", - " \"pooling_set\",\n", - " \"processed_sample_set\",\n", - " \"read_based_taxonomy_analysis_set\",\n", - " \"read_qc_analysis_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR3.py\n", - "from_PR10_to_PR3 = [\n", - " \"data_generation_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR176.py\n", - "from_X_to_PR176 = [\n", - " \"read_qc_analysis_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_PR176_to_PR104.py\n", - "from_PR176_to_PR104 = [\n", - " \"data_generation_set\"\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_X_to_PR192.py\n", - "from_X_to_PR192 = [\n", - " \"extraction_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/partials/migrator_from_10_2_0_to_11_0_0/migrator_from_X_to_PR104.py\n", - "from_X_to_PR104 = [\n", - " \"workflow_execution_set\",\n", - " \"metagenome_annotation_set\",\n", - " \"metagenome_assembly_set\",\n", - " \"metatranscriptome_assembly_set\",\n", - " \"metatranscriptome_annotation_set\",\n", - " \"metatranscriptome_analysis_set\",\n", - " \"mags_set\",\n", - " \"metagenome_sequencing_set\",\n", - " \"read_qc_analysis_set\",\n", - " \"read_based_taxonomy_analysis_set\",\n", - " \"metabolomics_analysis_set\",\n", - " 
\"metaproteomics_analysis_set\",\n", - " \"nom_analysis_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_PR104_to_PR195.py\n", - "from_PR104_to_PR195 = [\n", - " \"collecting_biosamples_from_site_set\",\n", - " \"protocol_execution_set\",\n", - " \"storage_process_set\",\n", - " \"material_processing_set\",\n", - " \"pooling_set\",\n", - " \"extraction_set\",\n", - " \"library_preparation_set\",\n", - " \"sub_sampling_process_set\",\n", - " \"mixing_process_set\",\n", - " \"filtration_process_set\",\n", - " \"chromatographic_separation_process_set\",\n", - " \"dissolving_process_set\",\n", - " \"chemical_conversion_process_set\",\n", - " \"data_generation_set\",\n", - " \"nucleotide_sequencing_set\",\n", - " \"mass_spectrometry_set\",\n", - " \"workflow_chain_set\",\n", - " \"workflow_execution_set\",\n", - " \"metagenome_annotation_set\",\n", - " \"metagenome_assembly_set\",\n", - " \"metatranscriptome_assembly_set\",\n", - " \"metatranscriptome_annotation_set\",\n", - " \"metatranscriptome_analysis_set\",\n", - " \"mags_set\",\n", - " \"metagenome_sequencing_set\",\n", - " \"read_qc_analysis_set\",\n", - " \"read_based_taxonomy_analysis_set\",\n", - " \"metabolomics_analysis_set\",\n", - " \"metaproteomics_analysis_set\",\n", - " \"nom_analysis_set\",\n", - "]\n", - "\n", - "# https://github.com/microbiomedata/berkeley-schema-fy24/blob/main/nmdc_schema/migrators/migrator_from_PR195_to_unknown.py\n", - "from_PR195_to_unknown = [\n", - " \"workflow_execution_set\",\n", - " \"workflow_chain_set\",\n", - "]\n", - "\n", - "# Note: `*arr` in Python is like `...arr` in JavaScript (it's a \"spread\" operator).\n", - "COLLECTION_NAMES: list[str] = [\n", - " *from_X_to_unknown,\n", - " *from_10_2_0_to_PR23,\n", - " *from_PR23_to_PR4,\n", - " *from_PR4_to_PR53,\n", - " *from_PR53_to_PR21,\n", - " *from_PR21_to_PR129,\n", - " *from_PR129_to_PR31,\n", - " *from_PR31_to_PR9,\n", - " *from_PR9_to_PR19_PR70,\n", - " *from_PR19_PR70_to_PR2_PR24,\n", - " *from_PR2_PR24_to_PR10,\n", - " *from_PR10_to_PR3,\n", - " *from_X_to_PR176,\n", - " *from_PR176_to_PR104,\n", - " *from_X_to_PR192,\n", - " *from_X_to_PR104,\n", - " *from_PR104_to_PR195,\n", - " *from_PR195_to_unknown,\n", - "]\n", - "print(str(len(COLLECTION_NAMES)) + \" collection names\")\n", - "\n", - "# Eliminate duplicates.\n", - "COLLECTION_NAMES = list(set(COLLECTION_NAMES))\n", - "print(str(len(COLLECTION_NAMES)) + \" collection names (distinct)\")" + "!docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4" ] }, { "cell_type": "markdown", - "id": "17f351e8", + "id": "22f5c78f", "metadata": {}, "source": [ - "### 2. Coordinate with stakeholders.\n", - "\n", - "We will be enacting full Runtime and Database downtime for this migration. Ensure stakeholders are aware of that." + "2. Delete **obsolete dumps** from previous notebooks runs.\n", + " 1. This is so the dumps you generate below will not be mixed in with any unrelated ones." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c70b6715", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {cfg.origin_dump_folder_path}\n", + "!rm -rf {cfg.transformer_dump_folder_path}" ] }, { "cell_type": "markdown", - "id": "233a35c3", + "id": "6cd05ccb", "metadata": {}, "source": [ - "### 3. Set up environment.\n", + "3. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. 
You can use `.notebook.env.example` as a template.\n", + "4. Create and populate the two **MongoDB configuration files**—`.mongo.origin.yaml` and `.mongo.transformer.yaml`—that this notebook will use to connect to the \"origin\" and \"transformer\" MongoDB servers, respectively. The \"origin\" MongoDB server is the one that contains the database you want to migrate; and the \"transformer\" MongoDB server is the one you want to use to perform the data transformations. In practice, the \"origin\" MongoDB server is typically a remote server, and the \"transformer\" MongoDB server is typically a local server.\n", + " 1. You can use `.mongo.yaml.example` as a template.\n", "\n", - "Here, you'll prepare an environment for running this notebook.\n", - "\n", - "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database named `nmdc`).\n", - " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command (this MongoDB server will be accessible without a username or password).\n", - " ```shell\n", - " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4\n", - " ```\n", - "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", - " 1. You can use `.notebook.env.example` as a template:\n", - " ```shell\n", - " $ cp .notebook.env.example .notebook.env\n", - " ```\n", - "3. Create and populate the two **MongoDB configuration files** that this notebook will use to connect to the \"origin\" and \"transformer\" MongoDB servers. The \"origin\" MongoDB server is the one that contains the database you want to migrate; and the \"transformer\" MongoDB server is the one you want to use to perform the data transformations. In practice, the \"origin\" MongoDB server is typically a remote server, and the \"transformer\" MongoDB server is typically a local server.\n", - " 1. You can use `.mongo.yaml.example` as a template:\n", - " ```shell\n", - " $ cp .mongo.yaml.example .mongo.origin.yaml\n", - " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", - " ```\n", - " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database.\n", - "\n", - "- TODO: Be more specific about the Mongo privileges necessary to perform a `mongodump` and a `mongorestore` that may involve creating/deleting collections." + "- TODO: Consolidate config files!" 
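> Aside (a hypothetical sketch only, not a statement of the actual requirements): MongoDB's built-in `backup` and `restore` roles cover `mongodump` and `mongorestore`, and `readWrite` plus `dbAdmin` on `nmdc` would allow creating and dropping collections there. Creating such a user could look roughly like the following; the user name and password are placeholders, and `origin_mongo_client` stands in for an authenticated `pymongo` client whose account can administer users in `admin`.

```python
# Hypothetical sketch only; the exact privileges this migration needs are still an open
# question above. Built-in roles: "backup"/"restore" for mongodump/mongorestore, plus
# "readWrite" and "dbAdmin" on "nmdc" for creating and dropping collections there.
origin_mongo_client["admin"].command(
    "createUser",
    "example_migration_user",  # placeholder name
    pwd="replace-me",          # placeholder password
    roles=[
        {"role": "backup", "db": "admin"},
        {"role": "restore", "db": "admin"},
        {"role": "readWrite", "db": "nmdc"},
        {"role": "dbAdmin", "db": "nmdc"},
    ],
)
```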
] }, { @@ -361,8 +116,8 @@ "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook cells) now.\n", "\n", "References: \n", - "- https://pypi.org/project/nmdc-schema/\n", - "- https://github.com/microbiomedata/berkeley-schema-fy24\n", + "- Berkeley Schema PyPI package (it's version 11+ of the `nmdc-schema` package): https://pypi.org/project/nmdc-schema/\n", + "- Berkeley Schema GitHub repo: https://github.com/microbiomedata/berkeley-schema-fy24\n", "- How to `pip install` a Git branch: https://stackoverflow.com/a/20101940" ] }, @@ -399,6 +154,8 @@ "source": [ "# Standard library packages:\n", "import subprocess\n", + "import logging\n", + "from typing import List\n", "\n", "# Third-party packages:\n", "import pymongo\n", @@ -432,14 +189,14 @@ "cfg = Config()\n", "\n", "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", - "mongodump = cfg.mongodump_path\n", + "mongodump = cfg.mongodump_path\n", "mongorestore = cfg.mongorestore_path\n", - "mongosh = cfg.mongosh_path\n", + "mongosh = cfg.mongosh_path\n", "\n", "# Perform a sanity test of the application paths.\n", - "!{mongodump} --version\n", + "!{mongodump} --version\n", "!{mongorestore} --version\n", - "!{mongosh} --version" + "!{mongosh} --version" ] }, { @@ -523,9 +280,7 @@ "\n", "We revoke \"write\" access so people don't make changes to the original data while the migration is happening, given that the migration ends with an overwriting of the original data.\n", "\n", - "We also revoke \"read\" access. The revocation of \"read\" access is technically optional, but (a) the JavaScript script will be easier for me to maintain if it revokes everything and (b) this prevents people from reading data during the restore step, during which the database may not be self-consistent.\n", - "\n", - "- TODO: Consider allowing \"read\" access until _just before_ the restore step.\n", + "We also revoke \"read\" access. The revocation of \"read\" access is technically optional, but (a) the JavaScript mongosh script will be easier for me to maintain if it revokes everything and (b) this prevents people from reading data during the restore step, during which the database may not be self-consistent.\n", "\n", "References:\n", "\n", @@ -564,27 +319,6 @@ "print(f\"\\nReturn code: {completed_process.returncode}\")" ] }, - { - "cell_type": "markdown", - "id": "e60c8935", - "metadata": {}, - "source": [ - "### Delete obsolete dumps\n", - "\n", - "Delete any existing dumps so that the dumps you generate below will not be mixed in with any unrelated ones." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df48c2ce", - "metadata": {}, - "outputs": [], - "source": [ - "!rm -rf {cfg.origin_dump_folder_path}\n", - "!rm -rf {cfg.transformer_dump_folder_path}" - ] - }, { "cell_type": "markdown", "id": "fd4994a0", @@ -592,26 +326,7 @@ "source": [ "### Dump collections from the \"origin\" MongoDB server\n", "\n", - "Use `mongodump` to dump the collections involved in this migration **from** the \"origin\" MongoDB server **into** a local directory.\n", - "\n", - "> Since `mongodump` doesn't provide a CLI option we can use to specify the collections we _want_ the dump to include, we use multiple occurrences of the `--excludeCollection` CLI option to exclude each collection we do _not_ want the dump to include. 
The end result is the same—there's just that extra step involved.\n", - "\n", - "- TODO: Consider ensuring that the local dump target folder is empty before doing this dump." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "831ac241", - "metadata": {}, - "outputs": [], - "source": [ - "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", - "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", - "non_agenda_collection_names = [name for name in all_collection_names if name not in COLLECTION_NAMES]\n", - "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", - "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", - "print(exclusion_options_str)" + "Use `mongodump` to dump all the collections **from** the \"origin\" MongoDB server **into** a local directory." ] }, { @@ -621,16 +336,12 @@ "metadata": {}, "outputs": [], "source": [ - "# FIXME: Temporarily excluding nothing so that everything gets dumped!\n", - "exclusion_options_str = \"\"\n", - "\n", - "# Dump the not-excluded collections from the \"origin\" database.\n", + "# Dump all collections from the \"origin\" database.\n", "!{mongodump} \\\n", " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", " --db=\"nmdc\" \\\n", " --gzip \\\n", - " --out=\"{cfg.origin_dump_folder_path}\" \\\n", - " {exclusion_options_str}" + " --out=\"{cfg.origin_dump_folder_path}\"" ] }, { @@ -640,26 +351,7 @@ "source": [ "### Load the dumped collections into the \"transformer\" MongoDB server\n", "\n", - "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server.\n", - "\n", - "> Since it's possible that the dump included extra collections (due to someone having created a collection between the time you generated the `--excludeCollection` CLI options and the time you ran `mongodump` above), we will use the `--nsInclude` CLI option to indicate which specific collections—from the dump—we want to load into the \"transformer\" database.\n", - "\n", - "> Note: This step typically takes 3 minutes (on a MacBook Pro M1, when running MongoDB in a Docker container).\n", - "\n", - "- TODO: Are \"views\" included in `mongodump` dumps? If so, how does `mongorestore` handle them—does it restore them as \"views\" or as normal \"collections\"?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4acae55", - "metadata": {}, - "outputs": [], - "source": [ - "# Build a string containing zero or more `--nsInclude=\"...\"` options, which can be included in a `mongorestore` command.\n", - "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in COLLECTION_NAMES]\n", - "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", - "print(inclusion_options_str)" + "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server." 
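> Aside (not part of the original notebook): once the restore finishes, a quick sanity check is to compare collection names and rough document counts between the two servers. A sketch, assuming the `origin_mongo_client` and `transformer_mongo_client` created earlier in this notebook:

```python
# Sketch: compare collection names and approximate document counts between the "origin"
# database and the freshly restored "transformer" database.
origin_db = origin_mongo_client["nmdc"]
transformer_db = transformer_mongo_client["nmdc"]

origin_names = set(origin_db.list_collection_names())
transformer_names = set(transformer_db.list_collection_names())
print("Collections missing from transformer:", sorted(origin_names - transformer_names))

for name in sorted(origin_names & transformer_names):
    n_origin = origin_db[name].estimated_document_count()
    n_transformer = transformer_db[name].estimated_document_count()
    flag = "" if n_origin == n_transformer else "  <-- count differs"
    print(f"{name}: origin={n_origin}, transformer={n_transformer}{flag}")
```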
] }, { @@ -669,9 +361,6 @@ "metadata": {}, "outputs": [], "source": [ - "# FIXME: Temporarily include nothing explicitly so that everything gets restored!\n", - "inclusion_options_str = \"\"\n", - "\n", "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", "!{mongorestore} \\\n", " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", @@ -679,8 +368,7 @@ " --drop \\\n", " --preserveUUID \\\n", " --stopOnError \\\n", - " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", - " {inclusion_options_str}" + " --dir=\"{cfg.origin_dump_folder_path}\"" ] }, { @@ -706,8 +394,6 @@ "metadata": {}, "outputs": [], "source": [ - "import logging\n", - "\n", "# Setup a logger that writes to a file.\n", "# TODO: Move this logger stuff to `helpers.py`.`\n", "LOG_FILE_PATH = \"./tmp.log\"\n", @@ -751,11 +437,7 @@ "source": [ "### Validate the transformed documents\n", "\n", - "Now that we have transformed the database, validate each document in each collection in the \"transformer\" MongoDB server.\n", - "\n", - "> Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py\n", - "\n", - "- TODO: Consider validating the (large) `functional_annotation_agg` collection _last_ so we find out about validation errors, if any, in _other_ (smaller) collections sooner." + "Now that we have transformed the database, validate each document in each collection in the \"transformer\" MongoDB server." ] }, { @@ -765,18 +447,21 @@ "metadata": {}, "outputs": [], "source": [ - "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `COLLECTION_NAMES`,\n", + "# Make a list of all slots of the `Database` class in the schema.\n", + "#\n", + "# TODO: Use a SchemaView for this instead of directly accessing the JSON Schema dictionary.\n", + "#\n", + "database_slot_names: List[str] = nmdc_jsonschema[\"$defs\"][\"Database\"][\"properties\"]\n", + "\n", + "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `database_slot_names`,\n", "# it goes at the end of the list we process. 
That way, we can find out about validation errors in\n", "# other collections without having to wait for that (large) collection to be validated before them.\n", - "ordered_collection_names = sorted(COLLECTION_NAMES.copy())\n", + "ordered_collection_names = sorted(database_slot_names.copy())\n", "large_collection_name = \"functional_annotation_agg\"\n", "if large_collection_name in ordered_collection_names:\n", " ordered_collection_names = list(filter(lambda n: n != large_collection_name, ordered_collection_names))\n", " ordered_collection_names.append(large_collection_name)\n", "\n", - "# TODO: Only validate documents in the collections that we will be restoring.\n", - "# Note: If a collection listed in `COLLECTION_NAMES` doesn't exist in the transformation\n", - "# database anymore, the inner `for` loop will just have zero iterations.\n", "for collection_name in ordered_collection_names:\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", " num_documents_in_collection = collection.count_documents({})\n", @@ -822,8 +507,7 @@ " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", " --db=\"nmdc\" \\\n", " --gzip \\\n", - " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", - " {exclusion_options_str}" + " --out=\"{cfg.transformer_dump_folder_path}\"" ] }, { @@ -889,9 +573,7 @@ "\n", "Load the transformed collections into the \"origin\" MongoDB server, **replacing** the collections there that have the same names.\n", "\n", - "> Note: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually.\n", - "\n", - "> Estimated time when running on laptop: 17 minutes" + "- TODO: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually." ] }, { @@ -901,15 +583,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "# Load the transformed collections into the origin server, replacing any same-named ones that are there.\n", "!{mongorestore} \\\n", " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", " --gzip \\\n", " --verbose \\\n", " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", " --drop --preserveUUID \\\n", - " --stopOnError \\\n", - " {inclusion_options_str}" + " --stopOnError" ] }, { @@ -941,7 +622,7 @@ "id": "04c856a8", "metadata": {}, "source": [ - "### Restore access to the MongoDB server\n", + "### Restore access to the \"origin\" MongoDB server\n", "\n", "This effectively un-does the access revocation that we did earlier." 
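> Aside (not part of the original notebook), related to the TODO a few cells above about renamed or deleted collections lingering on the "origin" server: a sketch for listing the likely leftovers, i.e. collections present on the origin server but absent from the transformed database, so they can be reviewed and dropped manually. It assumes the two `pymongo` clients created earlier and drops nothing unless the commented line is enabled.

```python
# Sketch: list collections that exist on the "origin" server but not in the transformed
# ("transformer") database; these are candidates for manual deletion after the restore.
leftover_names = (
    set(origin_mongo_client["nmdc"].list_collection_names())
    - set(transformer_mongo_client["nmdc"].list_collection_names())
)
for name in sorted(leftover_names):
    print(f"Candidate for manual drop: nmdc.{name}")
    # origin_mongo_client["nmdc"].drop_collection(name)  # enable only after reviewing
```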
] From 38ecdf04a2f33a6139cf81d377d17c72c28ec38a Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 11 Jul 2024 23:09:59 -0700 Subject: [PATCH 17/27] Rename Berkeley migration notebook to reflect initial schema version --- ...rate_10_2_0_to_11_0_0.ipynb => migrate_10_5_6_to_11_0_0.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename demo/metadata_migration/notebooks/{migrate_10_2_0_to_11_0_0.ipynb => migrate_10_5_6_to_11_0_0.ipynb} (100%) diff --git a/demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb similarity index 100% rename from demo/metadata_migration/notebooks/migrate_10_2_0_to_11_0_0.ipynb rename to demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb From d5ef873416e6896925a23bdf719ce4dc3d19925a Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 11 Jul 2024 23:26:06 -0700 Subject: [PATCH 18/27] Move logging setup code to `helpers.py` file --- demo/metadata_migration/notebooks/helpers.py | 35 +++++++++++++++++-- .../notebooks/migrate_10_5_6_to_11_0_0.ipynb | 29 ++------------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/demo/metadata_migration/notebooks/helpers.py b/demo/metadata_migration/notebooks/helpers.py index bfcdd9cf..69ccc6de 100644 --- a/demo/metadata_migration/notebooks/helpers.py +++ b/demo/metadata_migration/notebooks/helpers.py @@ -1,6 +1,8 @@ from pathlib import Path import re -from typing import Dict +from typing import Dict, Optional +import logging +from datetime import datetime from dotenv import dotenv_values import yaml @@ -145,4 +147,33 @@ def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None: self.transformer_mongo_host = notebook_config["transformer_mongo_host"] self.transformer_mongo_port = notebook_config["transformer_mongo_port"] self.transformer_mongo_username = notebook_config["transformer_mongo_username"] - self.transformer_mongo_password = notebook_config["transformer_mongo_password"] \ No newline at end of file + self.transformer_mongo_password = notebook_config["transformer_mongo_password"] + + +def setup_logger( + log_file_path: Optional[str] = None, + logger_name: str = "migrator_logger", + log_level: int = logging.DEBUG, +) -> logging.Logger: + r""" + Returns a logger that writes to a file at the specified log file path + (default: "./{YYYYMMDD_HHMM}_migration.log"). + """ + + # If no log file path was specified, generate one. 
+ if log_file_path is None: + yyyymmdd_hhmm: str = datetime.now().strftime("%Y%m%d_%H%M") # YYYYMMDD_HHMM + log_file_path = f"./{yyyymmdd_hhmm}_migration.log" + + logger = logging.getLogger(name=logger_name) + logger.setLevel(level=log_level) + file_handler = logging.FileHandler(log_file_path) + formatter = logging.Formatter( + fmt="[%(asctime)s %(name)s %(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + file_handler.setFormatter(formatter) + if logger.hasHandlers(): + logger.handlers.clear() # avoids duplicate log entries + logger.addHandler(file_handler) + return logger diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb index 19887ce5..f455198a 100644 --- a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb @@ -154,7 +154,6 @@ "source": [ "# Standard library packages:\n", "import subprocess\n", - "import logging\n", "from typing import List\n", "\n", "# Third-party packages:\n", @@ -165,7 +164,7 @@ "from nmdc_schema.migrators.migrator_from_10_2_0_to_11_0_0 import Migrator\n", "\n", "# First-party packages:\n", - "from helpers import Config\n", + "from helpers import Config, setup_logger\n", "from bookkeeper import Bookkeeper, MigrationEvent" ] }, @@ -382,30 +381,7 @@ "\n", "> Reminder: The database transformation functions are defined in the `nmdc-schema` Python package installed earlier.\n", "\n", - "> Reminder: The \"origin\" database is **not** affected by this step.\n", - "\n", - "- TODO: Consider deleting the existing log or appending a timestamp to the log filename." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df8ee3da", - "metadata": {}, - "outputs": [], - "source": [ - "# Setup a logger that writes to a file.\n", - "# TODO: Move this logger stuff to `helpers.py`.`\n", - "LOG_FILE_PATH = \"./tmp.log\"\n", - "logger = logging.getLogger(name=\"migrator_logger\")\n", - "logger.setLevel(logging.DEBUG)\n", - "file_handler = logging.FileHandler(LOG_FILE_PATH)\n", - "formatter = logging.Formatter(fmt=\"%(asctime)s\\t%(name)s\\t%(levelname)s\\t%(message)s\",\n", - " datefmt=\"%Y-%m-%d %H:%M:%S\")\n", - "file_handler.setFormatter(formatter)\n", - "if logger.hasHandlers():\n", - " logger.handlers.clear() # avoid duplicate log entries\n", - "logger.addHandler(file_handler)" + "> Reminder: The \"origin\" database is **not** affected by this step." ] }, { @@ -424,6 +400,7 @@ ")\n", "\n", "# Instantiate a Migrator bound to that adapter.\n", + "logger = setup_logger()\n", "migrator = Migrator(adapter=adapter, logger=logger)\n", "\n", "# Execute the Migrator's `upgrade` method to perform the migration.\n", From e1709c4012bb354f204bd36702999df504a2b548 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 11 Jul 2024 23:32:29 -0700 Subject: [PATCH 19/27] Import `Draft7Validator` with an alias to facilitate swapping it out --- .../notebooks/migrate_10_5_6_to_11_0_0.ipynb | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb index f455198a..74fb4e54 100644 --- a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb @@ -93,7 +93,7 @@ "4. 
Create and populate the two **MongoDB configuration files**—`.mongo.origin.yaml` and `.mongo.transformer.yaml`—that this notebook will use to connect to the \"origin\" and \"transformer\" MongoDB servers, respectively. The \"origin\" MongoDB server is the one that contains the database you want to migrate; and the \"transformer\" MongoDB server is the one you want to use to perform the data transformations. In practice, the \"origin\" MongoDB server is typically a remote server, and the \"transformer\" MongoDB server is typically a local server.\n", " 1. You can use `.mongo.yaml.example` as a template.\n", "\n", - "- TODO: Consolidate config files!" + "- TODO: Consolidate config files." ] }, { @@ -142,7 +142,9 @@ "source": [ "### Import Python dependencies\n", "\n", - "Import the Python objects upon which this notebook depends." + "Import the Python objects upon which this notebook depends.\n", + "\n", + "- TODO: Consider whether the JSON Schema Validator version (e.g. `Draft7Validator` versus `Draft201909Validator`) is consistent with the JSON Schema version." ] }, { @@ -158,7 +160,7 @@ "\n", "# Third-party packages:\n", "import pymongo\n", - "from jsonschema import Draft7Validator\n", + "from jsonschema import Draft7Validator as JSONSchemaValidator\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict, SchemaVariantIdentifier\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", "from nmdc_schema.migrators.migrator_from_10_2_0_to_11_0_0 import Migrator\n", @@ -188,14 +190,14 @@ "cfg = Config()\n", "\n", "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", - "mongodump = cfg.mongodump_path\n", + "mongodump = cfg.mongodump_path\n", "mongorestore = cfg.mongorestore_path\n", - "mongosh = cfg.mongosh_path\n", + "mongosh = cfg.mongosh_path\n", "\n", "# Perform a sanity test of the application paths.\n", - "!{mongodump} --version\n", + "!{mongodump} --version\n", "!{mongorestore} --version\n", - "!{mongosh} --version" + "!{mongosh} --version" ] }, { @@ -245,9 +247,7 @@ "source": [ "### Create JSON Schema validator\n", "\n", - "In this step, you'll create a JSON Schema validator for the NMDC Schema.\n", - "\n", - "- TODO: Consider whether the JSON Schema validator version is consistent with the JSON Schema version (e.g. draft 7 versus draft 2019)." + "In this step, you'll create a JSON Schema validator for the NMDC Schema." 
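> Aside (not part of the original notebook): for orientation, this is roughly how a validator like the one created below tends to be used on a single document: nest the document under its collection name, the way the schema's `Database` class is shaped, and collect the errors. The document and collection name here are placeholders, and the notebook's real validation loop appears later.

```python
# Sketch: validate one placeholder document against the NMDC JSON Schema by nesting it
# under its collection name. `nmdc_jsonschema_validator` is created in the cell below.
example_document = {"id": "nmdc:bsm-00-000000", "type": "nmdc:Biosample"}  # placeholder
instance = {"biosample_set": [example_document]}

errors = list(nmdc_jsonschema_validator.iter_errors(instance))
if errors:
    for error in errors:
        print(error.message)
else:
    print("Document conforms to the schema.")
```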
] }, { @@ -260,7 +260,7 @@ "outputs": [], "source": [ "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict(variant=SchemaVariantIdentifier.nmdc_materialized_patterns)\n", - "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)\n", + "nmdc_jsonschema_validator = JSONSchemaValidator(nmdc_jsonschema)\n", "\n", "# Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", "# Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema\n", From c2026595c9149dd55d15e7a81db929464ef5b919 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 11 Jul 2024 23:47:49 -0700 Subject: [PATCH 20/27] Convert all `!` commands into Python `subprocess.run` calls --- .../notebooks/migrate_10_5_6_to_11_0_0.ipynb | 98 +++++++++++++------ 1 file changed, 68 insertions(+), 30 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb index 74fb4e54..88acc05d 100644 --- a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb @@ -302,9 +302,6 @@ "# if the command were run via a `!` command (since, to me, the latter more closely resembles\n", "# regular shell usage).\n", "#\n", - "# TODO: Revisit this; and consider switching all the other `!` commands to use `subprocess`\n", - "# so that this notebook is closer to becoming a regular Python script.\n", - "#\n", "shell_command = f\"\"\"\n", " {cfg.mongosh_path} \\\n", " --host='{cfg.origin_mongo_host}' \\\n", @@ -336,11 +333,19 @@ "outputs": [], "source": [ "# Dump all collections from the \"origin\" database.\n", - "!{mongodump} \\\n", - " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", - " --db=\"nmdc\" \\\n", - " --gzip \\\n", - " --out=\"{cfg.origin_dump_folder_path}\"" + "shell_command = f\"\"\"\n", + " {mongodump} \\\n", + " --host='{cfg.origin_mongo_host}' \\\n", + " --port='{cfg.origin_mongo_port}' \\\n", + " --username='{cfg.origin_mongo_username}' \\\n", + " --password='{cfg.origin_mongo_password}' \\\n", + " --authenticationDatabase='admin' \\\n", + " --db='nmdc' \\\n", + " --gzip \\\n", + " --out='{cfg.origin_dump_folder_path}'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" ] }, { @@ -361,13 +366,21 @@ "outputs": [], "source": [ "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", - "!{mongorestore} \\\n", - " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", - " --gzip \\\n", - " --drop \\\n", - " --preserveUUID \\\n", - " --stopOnError \\\n", - " --dir=\"{cfg.origin_dump_folder_path}\"" + "shell_command = f\"\"\"\n", + " {mongorestore} \\\n", + " --host='{cfg.transformer_mongo_host}' \\\n", + " --port='{cfg.transformer_mongo_port}' \\\n", + " --username='{cfg.transformer_mongo_username}' \\\n", + " --password='{cfg.transformer_mongo_password}' \\\n", + " --authenticationDatabase='admin' \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --stopOnError \\\n", + " --dir='{cfg.origin_dump_folder_path}'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" ] }, { @@ -480,11 +493,19 @@ "outputs": [], "source": [ "# Dump the database from the \"transformer\" MongoDB server.\n", - "!{mongodump} \\\n", - " 
--config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", - " --db=\"nmdc\" \\\n", - " --gzip \\\n", - " --out=\"{cfg.transformer_dump_folder_path}\"" + "shell_command = f\"\"\"\n", + " {mongodump} \\\n", + " --host='{cfg.transformer_mongo_host}' \\\n", + " --port='{cfg.transformer_mongo_port}' \\\n", + " --username='{cfg.transformer_mongo_username}' \\\n", + " --password='{cfg.transformer_mongo_password}' \\\n", + " --authenticationDatabase='admin' \\\n", + " --db='nmdc' \\\n", + " --gzip \\\n", + " --out='{cfg.transformer_dump_folder_path}'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\") " ] }, { @@ -536,9 +557,17 @@ "source": [ "### TODO: Drop the original collections from the \"origin\" MongoDB server\n", "\n", - "This is necessary for situations where collections were renamed or deleted. The `--drop` option of `mongorestore` only drops collections that exist in the dump. We may need `mongosh` for this.\n", - "\n", - "- TODO: Now that the notebook does depend upon `mongosh`, revisit filling in this step." + "This is necessary for situations where collections were renamed or deleted. The `--drop` option of `mongorestore` only drops collections that exist in the dump." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0b26e434", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO!" ] }, { @@ -561,13 +590,22 @@ "outputs": [], "source": [ "# Load the transformed collections into the origin server, replacing any same-named ones that are there.\n", - "!{mongorestore} \\\n", - " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", - " --gzip \\\n", - " --verbose \\\n", - " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", - " --drop --preserveUUID \\\n", - " --stopOnError" + "shell_command = f\"\"\"\n", + " {mongorestore} \\\n", + " --host='{cfg.origin_mongo_host}' \\\n", + " --port='{cfg.origin_mongo_port}' \\\n", + " --username='{cfg.origin_mongo_username}' \\\n", + " --password='{cfg.origin_mongo_password}' \\\n", + " --authenticationDatabase='admin' \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir='{cfg.transformer_dump_folder_path}' \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --stopOnError\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\") " ] }, { From 60ed00c2be5fc40ad112732bb117bcb64df4ebac Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 12 Jul 2024 00:03:54 -0700 Subject: [PATCH 21/27] Consolidate config files into `.notebook.env` --- demo/metadata_migration/notebooks/.gitignore | 4 +- .../notebooks/.mongo.yaml.example | 29 -------- .../notebooks/.notebook.env.example | 9 --- demo/metadata_migration/notebooks/helpers.py | 68 ++----------------- .../notebooks/migrate_10_5_6_to_11_0_0.ipynb | 20 ++++-- 5 files changed, 19 insertions(+), 111 deletions(-) delete mode 100644 demo/metadata_migration/notebooks/.mongo.yaml.example diff --git a/demo/metadata_migration/notebooks/.gitignore b/demo/metadata_migration/notebooks/.gitignore index 255005c5..20949554 100644 --- a/demo/metadata_migration/notebooks/.gitignore +++ b/demo/metadata_migration/notebooks/.gitignore @@ -1,4 +1,4 @@ /.notebook.env -/.mongo.*.yaml /mongodump.*.out -/tmp.* \ No newline at end of file +/tmp.* +/*_migration.log \ No newline at end of file diff --git a/demo/metadata_migration/notebooks/.mongo.yaml.example b/demo/metadata_migration/notebooks/.mongo.yaml.example 
deleted file mode 100644 index a1df2e02..00000000 --- a/demo/metadata_migration/notebooks/.mongo.yaml.example +++ /dev/null @@ -1,29 +0,0 @@ -# MongoDB client configuration file for connecting to a MongoDB server. -# -# Instructions: -# -# 1. Update `uri` so it contains the MongoDB server connection string. -# -# Syntax: -# mongodb://{user}:{password}@{host}:{port}/?authSource={auth_database} -# -# Example: -# Assuming username is "root", password is "pass", host is "localhost", -# port is "27017", and name of authentication database is "admin", -# the value of `uri` would be: -# ``` -# mongodb://root:pass@localhost:27017/?authSource=admin -# ``` -# -# Example: -# Assuming the same scenario as in the previous example, but without -# access control enabled (i.e. no username/password), -# the value of `uri` would be: -# ``` -# mongodb://localhost:27017/ -# ``` -# -# Reference: -# https://www.mongodb.com/docs/database-tools/mongodump/#std-option-mongodump.--uri -# -uri: mongodb://user:pass@localhost:27017/?authSource=admin \ No newline at end of file diff --git a/demo/metadata_migration/notebooks/.notebook.env.example b/demo/metadata_migration/notebooks/.notebook.env.example index 6bed63ec..187c7197 100644 --- a/demo/metadata_migration/notebooks/.notebook.env.example +++ b/demo/metadata_migration/notebooks/.notebook.env.example @@ -1,7 +1,3 @@ -# Paths to Mongo config files. -PATH_TO_ORIGIN_MONGO_CONFIG_FILE = "./.mongo.origin.yaml" -PATH_TO_TRANSFORMER_MONGO_CONFIG_FILE = "./.mongo.transformer.yaml" - # Paths to folders in which the notebook will store Mongo dumps. PATH_TO_ORIGIN_MONGO_DUMP_FOLDER = "./mongodump.origin.out" PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER = "./mongodump.transformer.out" @@ -11,11 +7,6 @@ PATH_TO_MONGODUMP_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mong PATH_TO_MONGORESTORE_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongodb-database-tools-macos-arm64-100.7.4/bin/mongorestore" PATH_TO_MONGOSH_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongosh-1.10.6-darwin-x64/bin/mongosh" -# TODO: Now that the notebooks require both (a) Mongo config files — see above, and -# (b) discrete connection parameters — see below, consider using the parameters -# below to either (I) dynamically create a Mongo config file, or (II) use them -# directly as CLI options to `mongodump` and `mongorestore`. - # Connection parameters for the Origin Mongo server (typically a remote serve). ORIGIN_MONGO_HOST="__REPLACE_ME__" ORIGIN_MONGO_PORT="__REPLACE_ME__" diff --git a/demo/metadata_migration/notebooks/helpers.py b/demo/metadata_migration/notebooks/helpers.py index 69ccc6de..af0774b4 100644 --- a/demo/metadata_migration/notebooks/helpers.py +++ b/demo/metadata_migration/notebooks/helpers.py @@ -1,11 +1,9 @@ from pathlib import Path -import re from typing import Dict, Optional import logging from datetime import datetime from dotenv import dotenv_values -import yaml class Config: @@ -23,27 +21,9 @@ def parse_and_validate_notebook_config_file( # Parse the notebook config file. notebook_config = dotenv_values(notebook_config_file_path) - # Validate the Mongo config file paths. 
- origin_mongo_config_file_path = notebook_config[ - "PATH_TO_ORIGIN_MONGO_CONFIG_FILE" - ] - transformer_mongo_config_file_path = notebook_config[ - "PATH_TO_TRANSFORMER_MONGO_CONFIG_FILE" - ] - if not Path(origin_mongo_config_file_path).is_file(): - raise FileNotFoundError( - f"Origin Mongo config file not found at: {origin_mongo_config_file_path}" - ) - if not Path(transformer_mongo_config_file_path).is_file(): - raise FileNotFoundError( - f"Transformer Mongo config file not found at: {transformer_mongo_config_file_path}" - ) - # Validate the dump folder paths. origin_dump_folder_path = notebook_config["PATH_TO_ORIGIN_MONGO_DUMP_FOLDER"] - transformer_dump_folder_path = notebook_config[ - "PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER" - ] + transformer_dump_folder_path = notebook_config["PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER"] if not Path(origin_dump_folder_path).parent.is_dir(): raise FileNotFoundError( f"Parent folder of {origin_dump_folder_path} (origin Mongo dump folder path) not found." @@ -60,9 +40,7 @@ def parse_and_validate_notebook_config_file( if not Path(mongodump_path).is_file(): raise FileNotFoundError(f"mongodump binary not found at: {mongodump_path}") if not Path(mongorestore_path).is_file(): - raise FileNotFoundError( - f"mongorestore binary not found at: {mongorestore_path}" - ) + raise FileNotFoundError(f"mongorestore binary not found at: {mongorestore_path}") if not Path(mongosh_path).is_file(): raise FileNotFoundError(f"mongosh binary not found at: {mongosh_path}") @@ -77,8 +55,6 @@ def parse_and_validate_notebook_config_file( transformer_mongo_password = notebook_config["TRANSFORMER_MONGO_PASSWORD"] return dict( - origin_mongo_config_file_path=origin_mongo_config_file_path, - transformer_mongo_config_file_path=transformer_mongo_config_file_path, origin_dump_folder_path=origin_dump_folder_path, transformer_dump_folder_path=transformer_dump_folder_path, mongodump_path=mongodump_path, @@ -94,50 +70,14 @@ def parse_and_validate_notebook_config_file( transformer_mongo_password=transformer_mongo_password, ) - def parse_and_validate_mongo_config_file( - self, mongo_config_file_path: str - ) -> Dict[str, str]: - # Parse the Mongo config files as YAML. - with open(mongo_config_file_path, "r") as file: - mongo_config = yaml.safe_load(file) - - # Validate the connection string. - uri = mongo_config["uri"] - if not re.match( - r"^mongodb:\/\/.*", uri - ): # note: this is a sanity test, not a comprehensive test - raise ValueError(f"uri value in {mongo_config_file_path} is invalid.") - - return dict(uri=uri) - def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None: # Parse and validate the notebook config file. - notebook_config = self.parse_and_validate_notebook_config_file( - notebook_config_file_path - ) + notebook_config = self.parse_and_validate_notebook_config_file(notebook_config_file_path) self.mongodump_path = notebook_config["mongodump_path"] self.mongorestore_path = notebook_config["mongorestore_path"] self.mongosh_path = notebook_config["mongosh_path"] self.origin_dump_folder_path = notebook_config["origin_dump_folder_path"] - self.transformer_dump_folder_path = notebook_config[ - "transformer_dump_folder_path" - ] - - # Parse and validate the Mongo config files. 
- self.origin_mongo_config_file_path = notebook_config[ - "origin_mongo_config_file_path" - ] - self.transformer_mongo_config_file_path = notebook_config[ - "transformer_mongo_config_file_path" - ] - origin_mongo_server_config = self.parse_and_validate_mongo_config_file( - self.origin_mongo_config_file_path - ) - transformer_mongo_server_config = self.parse_and_validate_mongo_config_file( - self.transformer_mongo_config_file_path - ) - self.origin_mongo_server_uri = origin_mongo_server_config["uri"] - self.transformer_mongo_server_uri = transformer_mongo_server_config["uri"] + self.transformer_dump_folder_path = notebook_config["transformer_dump_folder_path"] # Parse the Mongo connection parameters. self.origin_mongo_host = notebook_config["origin_mongo_host"] diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb index 88acc05d..86fb80f3 100644 --- a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb @@ -50,7 +50,7 @@ "Here, you'll prepare an environment for running this notebook.\n", "\n", "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database named `nmdc`).\n", - " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command (this MongoDB server will be accessible without a username or password).\n" + " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command. A MongoDB server started this way will be accessible without a username or password.\n" ] }, { @@ -90,10 +90,8 @@ "source": [ "3. Create and populate a **notebook configuration file** named `.notebook.env`.\n", " 1. You can use `.notebook.env.example` as a template.\n", - "4. Create and populate the two **MongoDB configuration files**—`.mongo.origin.yaml` and `.mongo.transformer.yaml`—that this notebook will use to connect to the \"origin\" and \"transformer\" MongoDB servers, respectively. The \"origin\" MongoDB server is the one that contains the database you want to migrate; and the \"transformer\" MongoDB server is the one you want to use to perform the data transformations. In practice, the \"origin\" MongoDB server is typically a remote server, and the \"transformer\" MongoDB server is typically a local server.\n", - " 1. You can use `.mongo.yaml.example` as a template.\n", - "\n", - "- TODO: Consolidate config files." + " 2. The \"origin\" MongoDB server is the one that contains the database you want to migrate.\n", + " 3. The \"transformer\" MongoDB server is the one you want to use to perform the data transformations." 
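> Aside (not part of the original notebook): with everything consolidated into `.notebook.env`, a quick pre-flight check that the file defines every key the notebook and `helpers.py` now read can save a failed run later. A sketch; the key names follow `.notebook.env.example`.

```python
# Sketch: verify that .notebook.env defines all of the keys helpers.py expects.
from dotenv import dotenv_values

required_keys = [
    "PATH_TO_ORIGIN_MONGO_DUMP_FOLDER", "PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER",
    "PATH_TO_MONGODUMP_BINARY", "PATH_TO_MONGORESTORE_BINARY", "PATH_TO_MONGOSH_BINARY",
    "ORIGIN_MONGO_HOST", "ORIGIN_MONGO_PORT", "ORIGIN_MONGO_USERNAME", "ORIGIN_MONGO_PASSWORD",
    "TRANSFORMER_MONGO_HOST", "TRANSFORMER_MONGO_PORT",
    "TRANSFORMER_MONGO_USERNAME", "TRANSFORMER_MONGO_PASSWORD",
]
env = dotenv_values("./.notebook.env")
missing_keys = [key for key in required_keys if not env.get(key)]
print("Missing keys:", missing_keys if missing_keys else "none")
```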
] }, { @@ -218,10 +216,18 @@ "outputs": [], "source": [ "# Mongo client for \"origin\" MongoDB server.\n", - "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_host, \n", + " port=cfg.origin_mongo_port,\n", + " username=cfg.origin_mongo_username,\n", + " password=cfg.origin_mongo_password,\n", + " directConnection=True)\n", "\n", "# Mongo client for \"transformer\" MongoDB server.\n", - "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_host, \n", + " port=cfg.transformer_mongo_port,\n", + " username=cfg.transformer_mongo_username,\n", + " password=cfg.transformer_mongo_password,\n", + " directConnection=True)\n", "\n", "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", "with pymongo.timeout(3):\n", From 4c1c6c9128e4b7e8748da70fdbb027fbaedb8741 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 9 Aug 2024 22:53:34 -0700 Subject: [PATCH 22/27] Add cell that drops `nmdc` database from transformer server --- .../notebooks/migrate_10_5_6_to_11_0_0.ipynb | 97 +++++++++++++------ 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb index 86fb80f3..93f0e4a1 100644 --- a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb @@ -17,9 +17,9 @@ "source": [ "## Introduction\n", "\n", - "This notebook will be used to migrate the database from `nmdc-schema` `v10.5.6` ([released](https://github.com/microbiomedata/nmdc-schema/releases/tag/v10.5.6) June 25, 2024) to `v11.0.0` (i.e. the initial version of the Berkeley schema).\n", + "This notebook will be used to migrate the database from `nmdc-schema` `v10.5.6` ([released](https://github.com/microbiomedata/nmdc-schema/releases/tag/v10.5.6) June 25, 2024) to `v11.0.0` (i.e. the initial version of the so-called \"Berkeley schema\").\n", "\n", - "Unlike previous migrators, this one does not pick and choose which collections it will dump. There are two reasons for this: (1) migrators no longer have a dedicated `self.agenda` dictionary that indicates all the collections involved in the migration; and (2) this migration is the first one that involves creating, renaming, and dropping any collections; none of which were things that the old `self.agenda`-based system were designed to handle. So, instead of picking and choosing collections, this migrator **dumps them all.**" + "Unlike previous migrators, this one does not pick and choose which collections it will dump. There are two reasons for this: (1) migrators no longer have a dedicated `self.agenda` dictionary that indicates all the collections involved in the migration; and (2) this migration is the first one that involves creating, renaming, and dropping any collections; none of which are things that the old `self.agenda`-based system was designed to handle. So, instead of picking and choosing collections, this migrator **dumps them all.**" ] }, { @@ -45,7 +45,7 @@ "id": "233a35c3", "metadata": {}, "source": [ - "### 2. Set up environment.\n", + "### 2. 
Set up notebook environment.\n", "\n", "Here, you'll prepare an environment for running this notebook.\n", "\n", @@ -69,7 +69,7 @@ "metadata": {}, "source": [ "2. Delete **obsolete dumps** from previous notebooks runs.\n", - " 1. This is so the dumps you generate below will not be mixed in with any unrelated ones." + " 1. This is so the dumps you generate below will not be merged with any unrelated ones." ] }, { @@ -107,16 +107,19 @@ "id": "fe81196a", "metadata": {}, "source": [ - "### Install Python dependencies\n", + "### Install Python packages\n", "\n", "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends.\n", "\n", - "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook cells) now.\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook cells), then proceed to the next cell.\n", "\n", - "References: \n", - "- Berkeley Schema PyPI package (it's version 11+ of the `nmdc-schema` package): https://pypi.org/project/nmdc-schema/\n", - "- Berkeley Schema GitHub repo: https://github.com/microbiomedata/berkeley-schema-fy24\n", - "- How to `pip install` a Git branch: https://stackoverflow.com/a/20101940" + "##### References\n", + "\n", + "| Description | Link |\n", + "|---------------------------------------------------------------------------------|--------------------------------------------------------|\n", + "| Berkeley Schema PyPI package
(it's version 11+ of the `nmdc-schema` package) | https://pypi.org/project/nmdc-schema |\n", + "| Berkeley Schema GitHub repository | https://github.com/microbiomedata/berkeley-schema-fy24 |\n", + "| How to `pip install` from a Git branch
instead of PyPI | https://stackoverflow.com/a/20101940 |" ] }, { @@ -161,6 +164,9 @@ "from jsonschema import Draft7Validator as JSONSchemaValidator\n", "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict, SchemaVariantIdentifier\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", + "\n", + "# Note: The migrator module has \"10_2_0\" in its name because, when it was created,\n", + "# the latest legacy schema version was, indeed, still `10.2.0`.\n", "from nmdc_schema.migrators.migrator_from_10_2_0_to_11_0_0 import Migrator\n", "\n", "# First-party packages:\n", @@ -244,6 +250,44 @@ " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" ] }, + { + "cell_type": "markdown", + "id": "1e195db1", + "metadata": {}, + "source": [ + "Delete the \"nmdc\" database from the transformer MongoDB server if that database already exists there (e.g. if it was left over from an experiment).\n", + "\n", + "##### Description\n", + "\n", + "| Description | Link |\n", + "|------------------------------|---------------------------------------------------------------|\n", + "| Python's `subprocess` module | https://docs.python.org/3/library/subprocess.html |\n", + "| `mongosh` CLI options | https://www.mongodb.com/docs/mongodb-shell/reference/options/ |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8939a2ed", + "metadata": {}, + "outputs": [], + "source": [ + "# Note: I run this command via Python's `subprocess` module instead of via an IPython magic `!` command\n", + "# because I am expect to eventually use regular Python scripts—not Python notebooks—for migrations.\n", + "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} \\\n", + " --host='{cfg.transformer_mongo_host}' \\\n", + " --port='{cfg.transformer_mongo_port}' \\\n", + " --username='{cfg.transformer_mongo_username}' \\\n", + " --password='{cfg.transformer_mongo_password}' \\\n", + " --quiet \\\n", + " --eval 'use nmdc' \\\n", + " --eval 'db.dropDatabase()'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" + ] + }, { "cell_type": "markdown", "id": "bc387abc62686091", @@ -283,15 +327,17 @@ "source": [ "### Revoke access from the \"origin\" MongoDB server\n", "\n", - "We revoke \"write\" access so people don't make changes to the original data while the migration is happening, given that the migration ends with an overwriting of the original data.\n", + "We revoke both \"write\" and \"read\" access to the server.\n", + "\n", + "We revoke \"write\" access so people don't make changes to the original data while the migration is happening, given that the migration ends with an overwriting of the original data (which would wipe out any changes made in the meantime).\n", "\n", "We also revoke \"read\" access. 
The revocation of \"read\" access is technically optional, but (a) the JavaScript mongosh script will be easier for me to maintain if it revokes everything and (b) this prevents people from reading data during the restore step, during which the database may not be self-consistent.\n", "\n", - "References:\n", + "##### References\n", "\n", - "- https://docs.python.org/3/library/subprocess.html\n", - "- https://www.mongodb.com/docs/mongodb-shell/reference/options/\n", - "- https://www.mongodb.com/docs/mongodb-shell/write-scripts/" + "| Description | Link |\n", + "|--------------------------------|-----------------------------------------------------------|\n", + "| Running a script via `mongosh` | https://www.mongodb.com/docs/mongodb-shell/write-scripts/ |" ] }, { @@ -301,13 +347,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Note: I run this command via Python's `subprocess` module instead of via an IPython magic `!` command\n", - "# because one of the CLI options contains the Mongo password (since `mongosh` does not support the\n", - "# use of config files located anywhere except in the user's home directory) and my gut tells me\n", - "# this approach makes it less likely that the password appear in some shell history compared to\n", - "# if the command were run via a `!` command (since, to me, the latter more closely resembles\n", - "# regular shell usage).\n", - "#\n", "shell_command = f\"\"\"\n", " {cfg.mongosh_path} \\\n", " --host='{cfg.origin_mongo_host}' \\\n", @@ -328,7 +367,9 @@ "source": [ "### Dump collections from the \"origin\" MongoDB server\n", "\n", - "Use `mongodump` to dump all the collections **from** the \"origin\" MongoDB server **into** a local directory." + "Use `mongodump` to dump all the collections **from** the \"origin\" MongoDB server **into** a local directory.\n", + "\n", + "- TODO: Consider only dumping collections represented by the initial schema." ] }, { @@ -451,12 +492,12 @@ "\n", "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `database_slot_names`,\n", "# it goes at the end of the list we process. That way, we can find out about validation errors in\n", - "# other collections without having to wait for that (large) collection to be validated before them.\n", + "# other collections without having to wait for that (large) collection to be validated.\n", "ordered_collection_names = sorted(database_slot_names.copy())\n", "large_collection_name = \"functional_annotation_agg\"\n", "if large_collection_name in ordered_collection_names:\n", " ordered_collection_names = list(filter(lambda n: n != large_collection_name, ordered_collection_names))\n", - " ordered_collection_names.append(large_collection_name)\n", + " ordered_collection_names.append(large_collection_name) # puts it last\n", "\n", "for collection_name in ordered_collection_names:\n", " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", @@ -563,7 +604,7 @@ "source": [ "### TODO: Drop the original collections from the \"origin\" MongoDB server\n", "\n", - "This is necessary for situations where collections were renamed or deleted. The `--drop` option of `mongorestore` only drops collections that exist in the dump." + "This is necessary for situations where collections were renamed or deleted. 
(The `--drop` option of `mongorestore` only drops collections that exist in the dump.)" ] }, { @@ -583,9 +624,7 @@ "source": [ "### Load the collections into the \"origin\" MongoDB server\n", "\n", - "Load the transformed collections into the \"origin\" MongoDB server, **replacing** the collections there that have the same names.\n", - "\n", - "- TODO: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually." + "Load the transformed collections into the \"origin\" MongoDB server." ] }, { From 4c19cdc38aac6c8d23a3303c08132c9bd932c1a1 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sat, 17 Aug 2024 23:55:14 -0700 Subject: [PATCH 23/27] Drop original collections before restoring from transformed dump --- .../notebooks/migrate_10_5_6_to_11_0_0.ipynb | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb index 93f0e4a1..0ff4290b 100644 --- a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb @@ -487,6 +487,8 @@ "# Make a list of all slots of the `Database` class in the schema.\n", "#\n", "# TODO: Use a SchemaView for this instead of directly accessing the JSON Schema dictionary.\n", + "# Note: Doing so would involve introducing `linkml` as a dependency.\n", + "# Reference: https://linkml.io/linkml/developers/schemaview.html\n", "#\n", "database_slot_names: List[str] = nmdc_jsonschema[\"$defs\"][\"Database\"][\"properties\"]\n", "\n", @@ -602,7 +604,7 @@ "id": "9c253e6f", "metadata": {}, "source": [ - "### TODO: Drop the original collections from the \"origin\" MongoDB server\n", + "### Drop the original collections from the \"origin\" MongoDB server\n", "\n", "This is necessary for situations where collections were renamed or deleted. (The `--drop` option of `mongorestore` only drops collections that exist in the dump.)" ] @@ -614,7 +616,17 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO!" 
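For comparison, the same cleanup could be done through the pymongo client the notebook already holds, rather than by shelling out to `mongosh`. This is only a hedged alternative sketch, assuming the `origin_mongo_client` instantiated earlier in the notebook; the cell below keeps the `mongosh` route, consistent with the other server-side administration steps.

```py
# Alternative sketch: drop the origin server's `nmdc` database via pymongo
# instead of a `mongosh --eval` subprocess call. Assumes `origin_mongo_client`
# is the authenticated client created earlier in this notebook.
origin_mongo_client.drop_database("nmdc")
assert "nmdc" not in origin_mongo_client.list_database_names()
```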
+ "shell_command = f\"\"\"\n", + " {cfg.mongosh_path} \\\n", + " --host='{cfg.origin_mongo_host}' \\\n", + " --port='{cfg.origin_mongo_port}' \\\n", + " --username='{cfg.origin_mongo_username}' \\\n", + " --password='{cfg.origin_mongo_password}' \\\n", + " --eval 'use nmdc' \\\n", + " --eval 'db.dropDatabase()'\n", + "\"\"\"\n", + "completed_process = subprocess.run(shell_command, shell=True)\n", + "print(f\"\\nReturn code: {completed_process.returncode}\")" ] }, { From 176c3b9238a602d0f5696e250e8d8c653f9c499d Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 18 Aug 2024 00:16:54 -0700 Subject: [PATCH 24/27] Remove references to no-longer-existing config variables --- .../notebooks/test_helpers.py | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/demo/metadata_migration/notebooks/test_helpers.py b/demo/metadata_migration/notebooks/test_helpers.py index 761a6f08..f1aa2317 100644 --- a/demo/metadata_migration/notebooks/test_helpers.py +++ b/demo/metadata_migration/notebooks/test_helpers.py @@ -19,7 +19,13 @@ class TestConfig(unittest.TestCase): """ def test_init_method(self): - with TempFile() as notebook_config_file, TempFile() as origin_mongo_config_file, TempFile() as transformer_mongo_config_file, TempFile() as mongodump_binary, TempFile() as mongorestore_binary: + with (TempFile() as notebook_config_file, + TempFile() as origin_mongo_config_file, + TempFile() as transformer_mongo_config_file, + TempFile() as mongodump_binary, + TempFile() as mongorestore_binary, + TempFile() as mongosh_binary): + # Create named temporary directories and get their paths. origin_dump_folder_path = mkdtemp() transformer_dump_folder_path = mkdtemp() @@ -27,6 +33,14 @@ def test_init_method(self): # Populate the Mongo config files, then reset their file pointers. origin_mongo_server_uri = f"mongodb://u:p@origin:12345" transformer_mongo_server_uri = f"mongodb://u:p@transformer:12345" + origin_mongo_host = "origin" + origin_mongo_port = "11111" + origin_mongo_username = "origin_username" + origin_mongo_password = "origin_password" + transformer_mongo_host = "transformer" + transformer_mongo_port = "22222" + transformer_mongo_username = "transformer_username" + transformer_mongo_password = "transformer_password" origin_mongo_yaml = f"uri: {origin_mongo_server_uri}\n" transformer_mongo_yaml = f"uri: {transformer_mongo_server_uri}\n" origin_mongo_config_file.write(origin_mongo_yaml.encode("utf-8")) @@ -37,17 +51,25 @@ def test_init_method(self): # Use familiar aliases in an attempt to facilitate writing the `assert` section below. mongodump_path = mongodump_binary.name mongorestore_path = mongorestore_binary.name + mongosh_path = mongosh_binary.name origin_mongo_config_file_path = origin_mongo_config_file.name transformer_mongo_config_file_path = transformer_mongo_config_file.name # Populate the notebook config file, then reset its file pointer. 
notebook_config_values = dict( - PATH_TO_ORIGIN_MONGO_CONFIG_FILE=origin_mongo_config_file_path, - PATH_TO_TRANSFORMER_MONGO_CONFIG_FILE=transformer_mongo_config_file_path, PATH_TO_ORIGIN_MONGO_DUMP_FOLDER=origin_dump_folder_path, PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER=transformer_dump_folder_path, PATH_TO_MONGODUMP_BINARY=mongodump_path, PATH_TO_MONGORESTORE_BINARY=mongorestore_path, + PATH_TO_MONGOSH_BINARY=mongosh_path, + ORIGIN_MONGO_HOST=origin_mongo_host, + ORIGIN_MONGO_PORT=origin_mongo_port, + ORIGIN_MONGO_USERNAME=origin_mongo_username, + ORIGIN_MONGO_PASSWORD=origin_mongo_password, + TRANSFORMER_MONGO_HOST=transformer_mongo_host, + TRANSFORMER_MONGO_PORT=transformer_mongo_port, + TRANSFORMER_MONGO_USERNAME=transformer_mongo_username, + TRANSFORMER_MONGO_PASSWORD=transformer_mongo_password, ) for key, value in notebook_config_values.items(): notebook_config_file.write(f"{key} = {value}\n".encode("utf-8")) @@ -61,13 +83,14 @@ def test_init_method(self): assert cfg.mongorestore_path == mongorestore_path assert cfg.origin_dump_folder_path == origin_dump_folder_path assert cfg.transformer_dump_folder_path == transformer_dump_folder_path - assert cfg.origin_mongo_config_file_path == origin_mongo_config_file_path - assert ( - cfg.transformer_mongo_config_file_path - == transformer_mongo_config_file_path - ) - assert cfg.origin_mongo_server_uri == origin_mongo_server_uri - assert cfg.transformer_mongo_server_uri == transformer_mongo_server_uri + assert cfg.origin_mongo_host == origin_mongo_host + assert cfg.origin_mongo_port == origin_mongo_port + assert cfg.origin_mongo_username == origin_mongo_username + assert cfg.origin_mongo_password == origin_mongo_password + assert cfg.transformer_mongo_host == transformer_mongo_host + assert cfg.transformer_mongo_port == transformer_mongo_port + assert cfg.transformer_mongo_username == transformer_mongo_username + assert cfg.transformer_mongo_password == transformer_mongo_password # Delete the temporary directories (i.e. clean up). shutil.rmtree(origin_dump_folder_path) From 98e64791f4e38d3fa4940f031731e70a7fd9d89b Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 26 Aug 2024 21:03:03 -0700 Subject: [PATCH 25/27] Update version number of initial schema --- ...0.ipynb => migrate_10_8_0_to_11_0_0.ipynb} | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) rename demo/metadata_migration/notebooks/{migrate_10_5_6_to_11_0_0.ipynb => migrate_10_8_0_to_11_0_0.ipynb} (97%) diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_8_0_to_11_0_0.ipynb similarity index 97% rename from demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb rename to demo/metadata_migration/notebooks/migrate_10_8_0_to_11_0_0.ipynb index 0ff4290b..43dca9f9 100644 --- a/demo/metadata_migration/notebooks/migrate_10_5_6_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_8_0_to_11_0_0.ipynb @@ -6,9 +6,7 @@ "metadata": { "collapsed": true }, - "source": [ - "# Migrate MongoDB database from `nmdc-schema` `v10.5.6` to `v11.0.0`" - ] + "source": "# Migrate MongoDB database from `nmdc-schema` `v10.8.0` to `v11.0.0`" }, { "cell_type": "markdown", @@ -17,7 +15,7 @@ "source": [ "## Introduction\n", "\n", - "This notebook will be used to migrate the database from `nmdc-schema` `v10.5.6` ([released](https://github.com/microbiomedata/nmdc-schema/releases/tag/v10.5.6) June 25, 2024) to `v11.0.0` (i.e. 
the initial version of the so-called \"Berkeley schema\").\n", + "This notebook will be used to migrate the database from `nmdc-schema` `v10.8.0` ([released](https://github.com/microbiomedata/nmdc-schema/releases/tag/v10.8.0) August 21, 2024) to `v11.0.0` (i.e. the initial version of the so-called \"Berkeley schema\").\n", "\n", "Unlike previous migrators, this one does not pick and choose which collections it will dump. There are two reasons for this: (1) migrators no longer have a dedicated `self.agenda` dictionary that indicates all the collections involved in the migration; and (2) this migration is the first one that involves creating, renaming, and dropping any collections; none of which are things that the old `self.agenda`-based system was designed to handle. So, instead of picking and choosing collections, this migrator **dumps them all.**" ] @@ -133,7 +131,7 @@ "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", - "%pip install nmdc-schema==11.0.0rc16" + "%pip install nmdc-schema==11.0.0rc20" ] }, { @@ -273,7 +271,7 @@ "outputs": [], "source": [ "# Note: I run this command via Python's `subprocess` module instead of via an IPython magic `!` command\n", - "# because I am expect to eventually use regular Python scripts—not Python notebooks—for migrations.\n", + "# because I expect to eventually use regular Python scripts—not Python notebooks—for migrations.\n", "shell_command = f\"\"\"\n", " {cfg.mongosh_path} \\\n", " --host='{cfg.transformer_mongo_host}' \\\n", @@ -362,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "fd4994a0", + "id": "b7799910b6b0715d", "metadata": {}, "source": [ "### Dump collections from the \"origin\" MongoDB server\n", @@ -375,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf8fa1ca", + "id": "da530d6754c4f6fe", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +395,7 @@ }, { "cell_type": "markdown", - "id": "fd4994a0", + "id": "932ebde8abdd70ec", "metadata": {}, "source": [ "### Load the dumped collections into the \"transformer\" MongoDB server\n", @@ -408,7 +406,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf8fa1ca", + "id": "79bd888e82d52a93", "metadata": {}, "outputs": [], "source": [ @@ -447,7 +445,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05869340", + "id": "9c89c9dd3afe64e2", "metadata": {}, "outputs": [], "source": [ @@ -480,7 +478,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05869340", + "id": "e1c50b9911e02e70", "metadata": {}, "outputs": [], "source": [ From 9284c2b994f71aed5fda58f6d3be4fc75106db23 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 26 Aug 2024 21:05:21 -0700 Subject: [PATCH 26/27] Create "no op" migration notebook from `10.5.6` to `10.8.0` --- .../notebooks/migrate_10_5_6_to_10_8_0.ipynb | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 demo/metadata_migration/notebooks/migrate_10_5_6_to_10_8_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_10_5_6_to_10_8_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_5_6_to_10_8_0.ipynb new file mode 100644 index 00000000..ceb9b9a3 --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_10_5_6_to_10_8_0.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": "# Migrate MongoDB database from `nmdc-schema` `v10.5.6` to `v10.8.0`", + "id": "d05efc6327778f9c" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "There are no migrators associated with any schema changes between 
schema versions `v10.5.6` and `v10.8.0`. So, this notebook is a \"no op\" (i.e. \"no operation\").", + "id": "b99d5924e825b9a2" + }, + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "jupyter": { + "is_executing": true + } + }, + "source": "# no op", + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From e832cf15de5292d4d1116e37fa32749348174b91 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 26 Aug 2024 21:25:11 -0700 Subject: [PATCH 27/27] Get collection names via `SchemaView` (instead of traversing JSON Schema) --- demo/metadata_migration/notebooks/helpers.py | 29 +++- .../notebooks/migrate_10_8_0_to_11_0_0.ipynb | 158 ++++++++++-------- .../notebooks/requirements.txt | 3 +- 3 files changed, 117 insertions(+), 73 deletions(-) diff --git a/demo/metadata_migration/notebooks/helpers.py b/demo/metadata_migration/notebooks/helpers.py index af0774b4..4865ac3b 100644 --- a/demo/metadata_migration/notebooks/helpers.py +++ b/demo/metadata_migration/notebooks/helpers.py @@ -1,9 +1,13 @@ from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Optional, List import logging from datetime import datetime from dotenv import dotenv_values +from linkml_runtime import SchemaView + + +DATABASE_CLASS_NAME = "Database" class Config: @@ -117,3 +121,26 @@ def setup_logger( logger.handlers.clear() # avoids duplicate log entries logger.addHandler(file_handler) return logger + + +def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]: + """ + Returns the names of the slots of the `Database` class that describe database collections. + + :param schema_view: A `SchemaView` instance + """ + collection_names = [] + + for slot_name in schema_view.class_slots(DATABASE_CLASS_NAME): + slot_definition = schema_view.induced_slot(slot_name, DATABASE_CLASS_NAME) + + # Filter out any hypothetical (future) slots that don't correspond to a collection (e.g. `db_version`). + if slot_definition.multivalued and slot_definition.inlined_as_list: + collection_names.append(slot_name) + + # Filter out duplicate names. 
This is to work around the following issues in the schema: + # - https://github.com/microbiomedata/nmdc-schema/issues/1954 + # - https://github.com/microbiomedata/nmdc-schema/issues/1955 + collection_names = list(set(collection_names)) + + return collection_names diff --git a/demo/metadata_migration/notebooks/migrate_10_8_0_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_8_0_to_11_0_0.ipynb index 43dca9f9..e0a45cdb 100644 --- a/demo/metadata_migration/notebooks/migrate_10_8_0_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_8_0_to_11_0_0.ipynb @@ -53,13 +53,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "8aee55e3", "metadata": {}, - "outputs": [], "source": [ "!docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -72,14 +72,14 @@ }, { "cell_type": "code", - "execution_count": null, "id": "c70b6715", "metadata": {}, - "outputs": [], "source": [ "!rm -rf {cfg.origin_dump_folder_path}\n", "!rm -rf {cfg.transformer_dump_folder_path}" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -122,17 +122,17 @@ }, { "cell_type": "code", - "execution_count": null, "id": "e25a0af308c3185b", "metadata": { "collapsed": false }, - "outputs": [], "source": [ "%pip install --upgrade pip\n", "%pip install -r requirements.txt\n", "%pip install nmdc-schema==11.0.0rc20" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -148,10 +148,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "dbecd561", "metadata": {}, - "outputs": [], "source": [ "# Standard library packages:\n", "import subprocess\n", @@ -160,17 +158,20 @@ "# Third-party packages:\n", "import pymongo\n", "from jsonschema import Draft7Validator as JSONSchemaValidator\n", - "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict, SchemaVariantIdentifier\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict, SchemaVariantIdentifier, get_nmdc_jsonschema\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", + "from linkml_runtime import SchemaView\n", "\n", "# Note: The migrator module has \"10_2_0\" in its name because, when it was created,\n", "# the latest legacy schema version was, indeed, still `10.2.0`.\n", "from nmdc_schema.migrators.migrator_from_10_2_0_to_11_0_0 import Migrator\n", "\n", "# First-party packages:\n", - "from helpers import Config, setup_logger\n", + "from helpers import Config, setup_logger, get_collection_names_from_schema\n", "from bookkeeper import Bookkeeper, MigrationEvent" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -184,10 +185,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "1eac645a", "metadata": {}, - "outputs": [], "source": [ "cfg = Config()\n", "\n", @@ -200,7 +199,9 @@ "!{mongodump} --version\n", "!{mongorestore} --version\n", "!{mongosh} --version" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -214,10 +215,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "8e95f559", "metadata": {}, - "outputs": [], "source": [ "# Mongo client for \"origin\" MongoDB server.\n", "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_host, \n", @@ -246,7 +245,9 @@ "\n", " # Sanity test: Ensure the transformation database does not exist.\n", " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation 
database already exists.\"" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -265,10 +266,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "8939a2ed", "metadata": {}, - "outputs": [], "source": [ "# Note: I run this command via Python's `subprocess` module instead of via an IPython magic `!` command\n", "# because I expect to eventually use regular Python scripts—not Python notebooks—for migrations.\n", @@ -284,7 +283,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -300,12 +301,10 @@ }, { "cell_type": "code", - "execution_count": null, "id": "5c982eb0c04e606d", "metadata": { "collapsed": false }, - "outputs": [], "source": [ "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict(variant=SchemaVariantIdentifier.nmdc_materialized_patterns)\n", "nmdc_jsonschema_validator = JSONSchemaValidator(nmdc_jsonschema)\n", @@ -316,7 +315,29 @@ "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", "\n", "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" - ] + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Create SchemaView\n", + "\n", + "In this step, you'll instantiate a `SchemaView` that is bound to the destination schema. \n", + "\n", + "- Reference: https://linkml.io/linkml/developers/schemaview.html" + ], + "id": "e7e8befb362a1670" + }, + { + "metadata": {}, + "cell_type": "code", + "source": "schema_view = SchemaView(get_nmdc_jsonschema())", + "id": "625a6e7df5016677", + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -340,10 +361,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "f761caad", "metadata": {}, - "outputs": [], "source": [ "shell_command = f\"\"\"\n", " {cfg.mongosh_path} \\\n", @@ -356,7 +375,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -372,10 +393,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "da530d6754c4f6fe", "metadata": {}, - "outputs": [], "source": [ "# Dump all collections from the \"origin\" database.\n", "shell_command = f\"\"\"\n", @@ -391,7 +410,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -405,10 +426,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "79bd888e82d52a93", "metadata": {}, - "outputs": [], "source": [ "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", "shell_command = f\"\"\"\n", @@ -426,7 +445,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -444,10 +465,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "9c89c9dd3afe64e2", "metadata": {}, - "outputs": [], "source": [ "# Instantiate a MongoAdapter bound to the \"transformer\" database.\n", "adapter = MongoAdapter(\n", @@ -463,7 +482,9 @@ "\n", "# Execute the Migrator's `upgrade` method to perform the 
migration.\n", "migrator.upgrade()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -477,23 +498,16 @@ }, { "cell_type": "code", - "execution_count": null, "id": "e1c50b9911e02e70", "metadata": {}, - "outputs": [], "source": [ - "# Make a list of all slots of the `Database` class in the schema.\n", - "#\n", - "# TODO: Use a SchemaView for this instead of directly accessing the JSON Schema dictionary.\n", - "# Note: Doing so would involve introducing `linkml` as a dependency.\n", - "# Reference: https://linkml.io/linkml/developers/schemaview.html\n", - "#\n", - "database_slot_names: List[str] = nmdc_jsonschema[\"$defs\"][\"Database\"][\"properties\"]\n", + "# Get the names of all collections.\n", + "collection_names: List[str] = get_collection_names_from_schema(schema_view)\n", "\n", - "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `database_slot_names`,\n", + "# Ensure that, if the (large) \"functional_annotation_agg\" collection is present in `collection_names`,\n", "# it goes at the end of the list we process. That way, we can find out about validation errors in\n", "# other collections without having to wait for that (large) collection to be validated.\n", - "ordered_collection_names = sorted(database_slot_names.copy())\n", + "ordered_collection_names = sorted(collection_names.copy())\n", "large_collection_name = \"functional_annotation_agg\"\n", "if large_collection_name in ordered_collection_names:\n", " ordered_collection_names = list(filter(lambda n: n != large_collection_name, ordered_collection_names))\n", @@ -520,7 +534,9 @@ " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -534,10 +550,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "db6e432d", "metadata": {}, - "outputs": [], "source": [ "# Dump the database from the \"transformer\" MongoDB server.\n", "shell_command = f\"\"\"\n", @@ -553,7 +567,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\") " - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -569,13 +585,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "dbbe706d", "metadata": {}, - "outputs": [], "source": [ "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -589,13 +605,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "ca49f61a", "metadata": {}, - "outputs": [], "source": [ "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_STARTED)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -609,10 +625,8 @@ }, { "cell_type": "code", - "execution_count": 1, "id": "0b26e434", "metadata": {}, - "outputs": [], "source": [ "shell_command = f\"\"\"\n", " {cfg.mongosh_path} \\\n", @@ -625,7 +639,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -639,10 +655,8 @@ }, { "cell_type": "code", - 
"execution_count": null, "id": "1dfbcf0a", "metadata": {}, - "outputs": [], "source": [ "# Load the transformed collections into the origin server, replacing any same-named ones that are there.\n", "shell_command = f\"\"\"\n", @@ -661,7 +675,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\") " - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -677,15 +693,15 @@ }, { "cell_type": "code", - "execution_count": null, "id": "d1eaa6c92789c4f3", "metadata": { "collapsed": false }, - "outputs": [], "source": [ "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -699,10 +715,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "9aab3c7e", "metadata": {}, - "outputs": [], "source": [ "shell_command = f\"\"\"\n", " {cfg.mongosh_path} \\\n", @@ -715,7 +729,9 @@ "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" - ] + ], + "outputs": [], + "execution_count": null } ], "metadata": { diff --git a/demo/metadata_migration/notebooks/requirements.txt b/demo/metadata_migration/notebooks/requirements.txt index 0096244c..63794fcc 100644 --- a/demo/metadata_migration/notebooks/requirements.txt +++ b/demo/metadata_migration/notebooks/requirements.txt @@ -2,4 +2,5 @@ dictdiffer==0.9.0 jsonschema==4.19.2 pymongo==4.7.2 python-dotenv==1.0.0 -PyYAML==6.0.1 \ No newline at end of file +PyYAML==6.0.1 +linkml-runtime==1.8.2 \ No newline at end of file