From 2a2bd20f4ba6f47f82c8ec09d3f708436c70f3f0 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 28 Nov 2023 21:25:45 -0800 Subject: [PATCH 1/3] Implement (no-op) notebook for nmdc-schema 9.0.4 to 9.1.0 migration --- .../notebooks/migrate_9_0_4_to_9_1_0.ipynb | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 demo/metadata_migration/notebooks/migrate_9_0_4_to_9_1_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_9_0_4_to_9_1_0.ipynb b/demo/metadata_migration/notebooks/migrate_9_0_4_to_9_1_0.ipynb new file mode 100644 index 00000000..c7e2112a --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_9_0_4_to_9_1_0.ipynb @@ -0,0 +1,54 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Migrate Mongo data from `nmdc-schema` `v9.0.4` to `v9.1.0`\n", + "\n", + "This migration is a [no-op](https://en.wikipedia.org/wiki/NOP_(code)) (nothing to do)." + ], + "metadata": { + "collapsed": false + }, + "id": "37424e9ceccffe70" + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [], + "source": [ + "# Do nothing.\n", + "pass" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-29T05:19:55.043018Z", + "start_time": "2023-11-29T05:19:55.039907Z" + } + }, + "id": "3c27410cfb3021aa" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 035993cdbb78404d11fff7b414405de0d1e18480 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 28 Nov 2023 21:26:54 -0800 Subject: [PATCH 2/3] Implement Python notebook for nmdc-schema 9.1.0 to 9.2.0 migration --- .../notebooks/migrate_9_1_0_to_9_2_0.ipynb | 588 ++++++++++++++++++ 1 file changed, 588 insertions(+) create mode 100644 demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb b/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb new file mode 100644 index 00000000..3521ca44 --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_9_1_0_to_9_2_0.ipynb @@ -0,0 +1,588 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Migrate Mongo data from `nmdc-schema` `v9.1.0` to `v9.2.0`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### 1. Determine Mongo collections that will be transformed\n", + "\n", + "In this step, you will determine which Mongo collections will be transformed during this migration.\n", + "\n", + "1. In the `nmdc-schema` repo, in the `nmdc_schema/migrators` directory, locate the Python module whose name reflects the initial and final version numbers of this migration.\n", + "2. In the Python class defined within that module, locate the `self.agenda` dictionary.\n", + "3. In that dictionary, make a list of the keys—these are the names of the Mongo collections that will be transformed during this migration. For example:\n", + " ```py\n", + " self.agenda = dict(\n", + " collection_name_1=[self.some_function],\n", + " collection_name_2=[self.some_function],\n", + " )\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Coordinate with teammates who read/write to those collections\n",
+    "\n",
+    "In this step, you'll identify and reach out to the people who read/write to those collections, so that you can agree on a migration schedule that works for you and them.\n",
+    "\n",
+    "Here's a table of Mongo collections and the components of the NMDC system that write to them (according to [a conversation that occurred on September 11, 2023](https://nmdc-group.slack.com/archives/C01SVTKM8GK/p1694465755802979?thread_ts=1694216327.234519&cid=C01SVTKM8GK)).\n",
+    "\n",
+    "| Mongo collection                             | NMDC system components that write to it                   |\n",
+    "|----------------------------------------------|-----------------------------------------------------------|\n",
+    "| `biosample_set`                              | Workflows (via manual entry via `nmdc-runtime` HTTP API)  |\n",
+    "| `data_object_set`                            | Workflows (via `nmdc-runtime` HTTP API)                   |\n",
+    "| `mags_activity_set`                          | Workflows (via `nmdc-runtime` HTTP API)                   |\n",
+    "| `metagenome_annotation_activity_set`         | Workflows (via `nmdc-runtime` HTTP API)                   |\n",
+    "| `metagenome_assembly_set`                    | Workflows (via `nmdc-runtime` HTTP API)                   |\n",
+    "| `read_based_taxonomy_analysis_activity_set`  | Workflows (via `nmdc-runtime` HTTP API)                   |\n",
+    "| `read_qc_analysis_activity_set`              | Workflows (via `nmdc-runtime` HTTP API)                   |\n",
+    "| `jobs`                                       | Scheduler (via Mongo directly)                            |\n",
+    "| `*`                                          | `nmdc-runtime` (via Mongo directly)                       |\n",
+    "\n",
+    "You can use that table to help determine which people read/write to those collections. You can then coordinate a migration time slot with them via Slack, email, etc."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Set up a migration environment\n",
+    "\n",
+    "In this step, you'll set up an environment in which you can run this notebook.\n",
+    "\n",
+    "1. Start a **Mongo server** on your local machine (and ensure it does **not** contain a database named `nmdc`).\n",
+    "    1. You can start a temporary, [Docker](https://hub.docker.com/_/mongo)-based Mongo server at `localhost:27055` by running this command:\n",
+    "       ```shell\n",
+    "       # Run in any directory:\n",
+    "       docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo\n",
+    "       ```\n",
+    "       > Note: A Mongo server started via that command will have no access control (i.e. you will be able to access it without a username or password).\n",
+    "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n",
+    "    1. You can use the `.notebook.env.example` file as a template:\n",
+    "       ```shell\n",
+    "       # Run in the same directory as this notebook:\n",
+    "       $ cp .notebook.env.example .notebook.env\n",
+    "       ```\n",
+    "3. Create and populate **Mongo configuration files** for connecting to the origin and transformer Mongo servers.\n",
+    "    1. You can use the `.mongo.yaml.example` file as a template:\n",
+    "       ```shell\n",
+    "       # Run in the same directory as this notebook:\n",
+    "       $ cp .mongo.yaml.example .mongo.origin.yaml\n",
+    "       $ cp .mongo.yaml.example .mongo.transformer.yaml\n",
+    "       ```\n",
+    "       > When populating the file for the origin Mongo server, use credentials that have write access to the `nmdc` database."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Procedure"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Install Python dependencies\n",
+    "\n",
+    "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends. 
You can do that by running this cell.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook) now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==9.2.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "> Note: One of the Python objects is a Python class that is specific to this migration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library packages:\n", + "from pathlib import Path\n", + "from shutil import rmtree\n", + "from copy import deepcopy\n", + "\n", + "# Third-party packages:\n", + "import pymongo\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.migrators.migrator_from_9_1_to_9_2 import Migrator_from_9_1_to_9_2 as Migrator\n", + "from jsonschema import Draft7Validator\n", + "from dictdiffer import diff\n", + "\n", + "# First-party packages:\n", + "from helpers import Config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Programmatically determine which collections will be transformed\n", + "\n", + "Here are the names of the collections this migration will transform.\n", + "\n", + "> Ensure you have coordinated with the people that read/write to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agenda_collection_names = Migrator().agenda.keys()\n", + "\n", + "print(\"The following collections will be transformed:\")\n", + "print(\"\\n\".join(agenda_collection_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a sanity test of the application paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!{mongodump} --version\n", + "!{mongorestore} --version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Mongo clients\n", + "\n", + "Create Mongo clients you can use to access the \"origin\" Mongo server (i.e. the one containing the database you want to migrate) and the \"transformer\" Mongo server (i.e. the one you want to use to perform the data transformations)." 
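+    ,
+    "\n",
+    "> Note: In the next cell, the client for the origin Mongo server is created with `directConnection=True`, which tells PyMongo to connect directly to the named host instead of trying to discover and connect to the rest of a replica set. If you want to confirm basic connectivity on your own first, here's a minimal sketch (assuming the Docker-based Mongo server from the \"Prerequisites\" section is listening at `localhost:27055`):\n",
+    "\n",
+    "```py\n",
+    "# Minimal connectivity check (sketch): ping a Mongo server directly.\n",
+    "client = pymongo.MongoClient(host=\"mongodb://localhost:27055\", directConnection=True)\n",
+    "client.admin.command(\"ping\")  # raises an exception if the server is unreachable\n",
+    "```"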
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Mongo client for origin Mongo server.\n",
+    "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n",
+    "\n",
+    "# Mongo client for transformer Mongo server.\n",
+    "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Perform a sanity test of the Mongo clients' ability to access their respective Mongo servers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display the Mongo server version (running on the \"origin\" Mongo server).\n",
+    "print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n",
+    "\n",
+    "# Sanity test: Ensure the origin database exists.\n",
+    "assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n",
+    "\n",
+    "# Display the Mongo server version (running on the \"transformer\" Mongo server).\n",
+    "print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n",
+    "\n",
+    "# Sanity test: Ensure the transformer database does not exist yet.\n",
+    "assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformer database already exists.\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create JSON Schema validator\n",
+    "\n",
+    "In this step, you'll create a JSON Schema validator for the NMDC Schema."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict()\n",
+    "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n",
+    "\n",
+    "> Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n",
+    "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n",
+    "\n",
+    "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Dump collections from the \"origin\" Mongo server\n",
+    "\n",
+    "In this step, you'll use `mongodump` to dump, from the \"origin\" Mongo server, the collections that will be transformed during this migration.\n",
+    "\n",
+    "Since `mongodump` doesn't provide a CLI option you can use to specify the collections you _want_ it to dump (unless you want to dump only a single collection), you can use a different CLI option to tell it all the collections you do _not_ want it to dump. The end result will be the same—there's just an extra step involved."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "That extra step is to generate an `--excludeCollection=\"{name}\"` CLI option for each collection that is not on the agenda, which you'll do now."
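+    ,
+    "\n",
+    "For example, if the origin database contained a collection named `some_collection_not_on_the_agenda` (a hypothetical name used here for illustration), the next cell would generate an option like this for it:\n",
+    "\n",
+    "```shell\n",
+    "--excludeCollection='some_collection_not_on_the_agenda'\n",
+    "```"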
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n",
+    "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n",
+    "non_agenda_collection_names = [name for name in all_collection_names if name not in agenda_collection_names]\n",
+    "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n",
+    "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n",
+    "\n",
+    "print(exclusion_options_str)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, you'll run a `mongodump` command containing all those `--excludeCollection=\"{name}\"` CLI options."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dump the not-excluded collections from the origin database.\n",
+    "!{mongodump} \\\n",
+    "  --config=\"{cfg.origin_mongo_config_file_path}\" \\\n",
+    "  --db=\"nmdc\" \\\n",
+    "  --gzip \\\n",
+    "  --out=\"{cfg.origin_dump_folder_path}\" \\\n",
+    "  {exclusion_options_str}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the collections into the \"transformer\" Mongo server\n",
+    "\n",
+    "In this step, you'll load the collections dumped from the \"origin\" Mongo server into the \"transformer\" Mongo server.\n",
+    "\n",
+    "Since it's possible that the dump includes more collections than are on the agenda (due to someone creating a collection between the time you generated the exclusion list and the time you ran `mongodump`), you will use one or more of `mongorestore`'s `--nsInclude` CLI options to indicate which collections you want to load.\n",
+    "\n",
+    "Here's where you will generate the `--nsInclude=\"nmdc.{name}\"` CLI options."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in agenda_collection_names]\n",
+    "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n",
+    "\n",
+    "print(inclusion_options_str)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, you'll run a `mongorestore` command containing all those `--nsInclude=\"nmdc.{name}\"` CLI options."
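+    ,
+    "\n",
+    "> Tip: If you'd like to preview what `mongorestore` will do before it writes anything, you can first run the same command with the `--dryRun` flag added. This is an optional sketch, and it assumes your version of the MongoDB Database Tools supports `--dryRun` (the `\"...\"` placeholders stand in for the config and dump paths used above):\n",
+    "\n",
+    "```shell\n",
+    "# Hypothetical preview run; no data is written.\n",
+    "mongorestore --config=\"...\" --gzip --drop --preserveUUID --dryRun --verbose --dir=\"...\" --nsInclude='nmdc.biosample_set'\n",
+    "```"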
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Restore the dumped collections to the transformer MongoDB server.\n",
+    "!{mongorestore} \\\n",
+    "  --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n",
+    "  --gzip \\\n",
+    "  --drop \\\n",
+    "  --preserveUUID \\\n",
+    "  --dir=\"{cfg.origin_dump_folder_path}\" \\\n",
+    "  {inclusion_options_str}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Transform the collections within the \"transformer\" Mongo server\n",
+    "\n",
+    "Now that the transformer database contains a copy of each collection on the agenda, you can transform those copies.\n",
+    "\n",
+    "The transformation functions are provided by the `nmdc-schema` Python package.\n",
+    "> You can examine the transformation functions at: https://github.com/microbiomedata/nmdc-schema/blob/main/nmdc_schema/migration_recursion.py\n",
+    "\n",
+    "In this step, you will retrieve each document from each collection on the agenda, pass it through the transformation function(s) associated with that collection, then store the transformed document in place of the original one—all within the \"transformer\" database only. **The \"origin\" database is not involved in this step.**\n",
+    "\n",
+    "> Note: This step also includes validation. Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py\n",
+    "\n",
+    "> Note: This step also includes a before-and-after comparison to facilitate manual spot checks. References: https://docs.python.org/3/library/copy.html#copy.deepcopy and https://dictdiffer.readthedocs.io/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "migrator = Migrator()\n",
+    "\n",
+    "# Apply the transformations.\n",
+    "for collection_name, transformation_pipeline in migrator.agenda.items():\n",
+    "    print(f\"Transforming documents in collection: {collection_name}\")\n",
+    "    transformed_documents = []\n",
+    "\n",
+    "    # Get each document from this collection.\n",
+    "    collection = transformer_mongo_client[\"nmdc\"][collection_name]\n",
+    "    for original_document in collection.find():\n",
+    "        # Print the original document, to enable manual spot checks.\n",
+    "        print(original_document)\n",
+    "\n",
+    "        # Make a deep copy of the original document, to enable before-and-after comparison.\n",
+    "        copy_of_original_document = deepcopy(original_document)\n",
+    "\n",
+    "        # Put the document through the transformation pipeline associated with this collection.\n",
+    "        transformed_document = original_document # initializes the variable\n",
+    "        for transformation_function in transformation_pipeline:\n",
+    "            transformed_document = transformation_function(transformed_document)\n",
+    "            print(transformed_document) # prints the result of each transformation step\n",
+    "\n",
+    "        # Compare the transformed document with a copy of the original document;\n",
+    "        # and, if there are any differences, print those differences.\n",
+    "        difference = diff(copy_of_original_document, transformed_document)\n",
+    "        differences = list(difference)\n",
+    "        if len(differences) > 0:\n",
+    "            print(f\"✏️ {differences}\")\n",
+    "\n",
+    "        # Validate the transformed document.\n",
+    "        #\n",
+    "        # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n",
+    "        #\n",
+    "        # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`. 
However,\n",
+    "        # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n",
+    "        # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n",
+    "        # copy (i.e. a shallow copy) of the document that lacks that specific key.\n",
+    "        #\n",
+    "        # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n",
+    "        #       Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n",
+    "        #\n",
+    "        transformed_document_without_underscore_id_key = {key: value for key, value in transformed_document.items() if key != \"_id\"}\n",
+    "        root_to_validate = dict([(collection_name, [transformed_document_without_underscore_id_key])])\n",
+    "        nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n",
+    "\n",
+    "        # Store the transformed document.\n",
+    "        transformed_documents.append(transformed_document)\n",
+    "        print(\"\")\n",
+    "\n",
+    "    # Replace the original documents with the transformed versions of themselves (in the transformer database).\n",
+    "    for transformed_document in transformed_documents:\n",
+    "        collection.replace_one({\"id\": {\"$eq\": transformed_document[\"id\"]}}, transformed_document)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Dump the transformed collections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dump the database from the transformer MongoDB server.\n",
+    "!{mongodump} \\\n",
+    "  --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n",
+    "  --db=\"nmdc\" \\\n",
+    "  --gzip \\\n",
+    "  --out=\"{cfg.transformer_dump_folder_path}\" \\\n",
+    "  {exclusion_options_str}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the transformed data into the \"origin\" Mongo server\n",
+    "\n",
+    "In this step, you'll put the transformed collection(s) into the origin Mongo server, replacing the original collection(s) that have the same name(s)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n",
+    "!{mongorestore} \\\n",
+    "  --config=\"{cfg.origin_mongo_config_file_path}\" \\\n",
+    "  --gzip \\\n",
+    "  --verbose \\\n",
+    "  --dir=\"{cfg.transformer_dump_folder_path}\" \\\n",
+    "  --drop \\\n",
+    "  --preserveUUID \\\n",
+    "  {inclusion_options_str}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### (Optional) Clean up\n",
+    "\n",
+    "Delete the temporary files and MongoDB dumps created by this notebook.\n",
+    "\n",
+    "> Note: You can skip this step if you want to delete them manually later (e.g. to examine them before deleting them)."
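+    ,
+    "\n",
+    "If you'd like to examine the dumps before deleting them, one option (a sketch, assuming a Unix-like environment) is to list the dump folders' contents from a new notebook cell first:\n",
+    "\n",
+    "```py\n",
+    "# Optional: inspect the dump folders before deleting them.\n",
+    "!ls -lh {cfg.origin_dump_folder_path}\n",
+    "!ls -lh {cfg.transformer_dump_folder_path}\n",
+    "```"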
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "paths_to_files_to_delete = []\n", + "\n", + "paths_to_folders_to_delete = [\n", + " cfg.origin_dump_folder_path,\n", + " cfg.transformer_dump_folder_path,\n", + "]\n", + "\n", + "# Delete files.\n", + "for path in [Path(string) for string in paths_to_files_to_delete]:\n", + " try:\n", + " path.unlink()\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")\n", + "\n", + "# Delete folders.\n", + "for path in [Path(string) for string in paths_to_folders_to_delete]:\n", + " try:\n", + " rmtree(path)\n", + " print(f\"Deleted: {path}\")\n", + " except:\n", + " print(f\"Failed to delete: {path}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1e2bf1cbbafefa1a7e670267fdbd46aa50c0cd3f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 29 Nov 2023 13:46:14 -0800 Subject: [PATCH 3/3] Update `nmdc-schema` package from `9.1.0` to `9.2.0` --- requirements/dev.txt | 46 ++++++++--- requirements/main.in | 2 +- requirements/main.txt | 179 ++++++++++++++++++++++++++---------------- 3 files changed, 144 insertions(+), 83 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index b87ef35a..aca7b8cb 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --allow-unsafe --output-file=requirements/dev.txt --strip-extras requirements/dev.in @@ -15,14 +15,18 @@ black==23.11.0 # -r requirements/dev.in build==1.0.3 # via pip-tools -cattrs==23.1.2 +cattrs==23.2.2 # via # -c requirements/main.txt # requests-cache -certifi==2023.7.22 +certifi==2023.11.17 # via # -c requirements/main.txt # requests +cffi==1.16.0 + # via + # -c requirements/main.txt + # cryptography charset-normalizer==3.3.2 # via # -c requirements/main.txt @@ -36,19 +40,27 @@ coverage==7.3.2 # via # -r requirements/dev.in # pytest-cov +cryptography==41.0.7 + # via + # -c requirements/main.txt + # secretstorage docutils==0.20.1 # via # -c requirements/main.txt # readme-renderer +exceptiongroup==1.2.0 + # via + # -c requirements/main.txt + # cattrs + # pytest flake8==6.1.0 # via -r requirements/dev.in -idna==3.4 +idna==3.6 # via # -c requirements/main.txt # requests importlib-metadata==6.8.0 # via - # -c requirements/main.txt # keyring # twine iniconfig==2.0.0 @@ -59,6 +71,10 @@ invoke==2.2.0 # via -r requirements/dev.in jaraco-classes==3.3.0 # via keyring +jeepney==0.8.0 + # via + # keyring + # secretstorage keyring==24.3.0 # via twine markdown-it-py==3.0.0 @@ -104,11 +120,15 @@ pluggy==1.3.0 # pytest pycodestyle==2.11.1 # via flake8 +pycparser==2.21 + # via + # -c requirements/main.txt + # cffi pyflakes==3.1.0 # via # -c requirements/main.txt # flake8 -pygments==2.16.1 +pygments==2.17.2 # via # -c requirements/main.txt # readme-renderer @@ -134,7 +154,7 @@ requests==2.31.0 # requests-mock # requests-toolbelt # twine -requests-cache==1.1.0 +requests-cache==1.1.1 # via # -c requirements/main.txt # -r 
requirements/dev.in @@ -146,8 +166,10 @@ requests-toolbelt==0.10.1 # twine rfc3986==2.0.0 # via twine -rich==13.6.0 +rich==13.7.0 # via twine +secretstorage==3.3.3 + # via keyring six==1.16.0 # via # -c requirements/main.txt @@ -179,19 +201,17 @@ urllib3==1.26.18 # requests # requests-cache # twine -wheel==0.41.3 +wheel==0.42.0 # via pip-tools zipp==3.17.0 - # via - # -c requirements/main.txt - # importlib-metadata + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: pip==23.3.1 # via # -r requirements/dev.in # pip-tools -setuptools==68.2.2 +setuptools==69.0.2 # via # -c requirements/main.txt # -r requirements/dev.in diff --git a/requirements/main.in b/requirements/main.in index c5c46de0..7d4db727 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -24,7 +24,7 @@ mkdocs-jupyter mkdocs-material mkdocs-mermaid2-plugin motor -nmdc-schema==9.1.0 +nmdc-schema==9.2.0 openpyxl pandas passlib[bcrypt] diff --git a/requirements/main.txt b/requirements/main.txt index c9e99118..852180fb 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --allow-unsafe --output-file=requirements/main.txt --strip-extras requirements/main.in @@ -23,10 +23,6 @@ anyio==3.7.1 # jupyter-server # starlette # watchfiles -appnope==0.1.3 - # via - # ipykernel - # ipython argon2-cffi==23.1.0 # via jupyter-server argon2-cffi-bindings==21.2.0 @@ -54,7 +50,7 @@ backoff==2.2.1 # via gql base32-lib==1.0.2 # via -r requirements/main.in -bcrypt==4.0.1 +bcrypt==4.1.1 # via passlib beanie==1.23.6 # via -r requirements/main.in @@ -67,15 +63,17 @@ black==23.11.0 # via shed bleach==6.1.0 # via nbconvert -boto3==1.29.0 +boto3==1.33.3 # via -r requirements/main.in -botocore==1.32.0 +botocore==1.33.3 # via # boto3 # s3transfer -cattrs==23.1.2 +cachetools==5.3.2 + # via tox +cattrs==23.2.2 # via requests-cache -certifi==2023.7.22 +certifi==2023.11.17 # via requests cffi==1.16.0 # via @@ -87,6 +85,7 @@ chardet==5.2.0 # via # pyshex # pyshexc + # tox charset-normalizer==3.3.2 # via requests click==8.1.7 @@ -108,7 +107,9 @@ click==8.1.7 click-log==0.4.0 # via nmdc-schema colorama==0.4.6 - # via mkdocs-material + # via + # mkdocs-material + # tox coloredlogs==14.0 # via dagster com2ann==0.3.0 @@ -119,27 +120,29 @@ comm==0.2.0 # ipywidgets croniter==2.0.1 # via dagster -cryptography==41.0.5 +cryptography==41.0.7 # via python-jose curies==0.7.4 - # via linkml-runtime -dagit==1.5.7 + # via + # linkml-runtime + # prefixmaps +dagit==1.5.9 # via -r requirements/main.in -dagster==1.5.7 +dagster==1.5.9 # via # -r requirements/main.in # dagster-graphql # dagster-postgres # dagster-webserver -dagster-graphql==1.5.7 +dagster-graphql==1.5.9 # via # -r requirements/main.in # dagster-webserver -dagster-pipes==1.5.7 +dagster-pipes==1.5.9 # via dagster -dagster-postgres==0.21.7 +dagster-postgres==0.21.9 # via -r requirements/main.in -dagster-webserver==1.5.7 +dagster-webserver==1.5.9 # via dagit debugpy==1.8.0 # via ipykernel @@ -151,6 +154,8 @@ dependency-injector==4.41.0 # via -r requirements/main.in deprecated==1.2.14 # via linkml-runtime +distlib==0.3.7 + # via virtualenv dnspython==2.4.2 # via # email-validator @@ -169,6 +174,12 @@ email-validator==2.1.0.post1 # via pydantic et-xmlfile==1.1.0 # via openpyxl +exceptiongroup==1.2.0 + # via + # anyio + # cattrs + # ipython + # pytest executing==2.0.1 # via 
stack-data fastapi==0.104.1 @@ -177,11 +188,15 @@ fastjsonschema==2.19.0 # via # -r requirements/main.in # nbformat +filelock==3.13.1 + # via + # tox + # virtualenv fnc==0.5.3 # via -r requirements/main.in fqdn==1.5.1 # via jsonschema -frozendict==2.3.8 +frozendict==2.3.9 # via -r requirements/main.in fsspec==2023.10.0 # via universal-pathlib @@ -204,13 +219,13 @@ graphql-relay==3.2.0 # via graphene graphviz==0.20.1 # via linkml -greenlet==2.0.1 - # via prefixmaps -grpcio==1.59.2 +greenlet==3.0.1 + # via sqlalchemy +grpcio==1.59.3 # via # dagster # grpcio-health-checking -grpcio-health-checking==1.59.2 +grpcio-health-checking==1.59.3 # via dagster h11==0.14.0 # via uvicorn @@ -223,7 +238,7 @@ httptools==0.6.1 # via uvicorn humanfriendly==10.0 # via coloredlogs -idna==3.4 +idna==3.6 # via # anyio # email-validator @@ -232,18 +247,16 @@ idna==3.4 # yarl imagesize==1.4.1 # via sphinx -importlib-metadata==6.8.0 - # via prefixmaps iniconfig==2.0.0 # via pytest -ipykernel==6.26.0 +ipykernel==6.27.1 # via # jupyter # jupyter-console # jupyterlab # mkdocs-jupyter # qtconsole -ipython==8.17.2 +ipython==8.18.1 # via # ipykernel # ipywidgets @@ -302,7 +315,7 @@ jsonpointer==2.4 # via # jsonpatch # jsonschema -jsonschema==4.19.2 +jsonschema==4.20.0 # via # jsonschema # jupyter-events @@ -336,9 +349,9 @@ jupyter-core==5.5.0 # qtconsole jupyter-events==0.9.0 # via jupyter-server -jupyter-lsp==2.2.0 +jupyter-lsp==2.2.1 # via jupyterlab -jupyter-server==2.10.0 +jupyter-server==2.11.1 # via # jupyter-lsp # jupyterlab @@ -347,13 +360,13 @@ jupyter-server==2.10.0 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.8 +jupyterlab==4.0.9 # via # -r requirements/main.in # notebook -jupyterlab-pygments==0.2.2 +jupyterlab-pygments==0.3.0 # via nbconvert -jupyterlab-server==2.25.1 +jupyterlab-server==2.25.2 # via # jupyterlab # notebook @@ -365,7 +378,7 @@ lazy-model==0.2.0 # via beanie libcst==1.1.0 # via shed -linkml==1.6.2 +linkml==1.6.3 # via # -r requirements/main.in # nmdc-schema @@ -413,15 +426,15 @@ mkdocs==1.5.3 # mkdocs-mermaid2-plugin mkdocs-jupyter==0.24.6 # via -r requirements/main.in -mkdocs-material==9.4.8 +mkdocs-material==9.4.14 # via # -r requirements/main.in # mkdocs-jupyter -mkdocs-material-extensions==1.3 +mkdocs-material-extensions==1.3.1 # via mkdocs-material mkdocs-mermaid2-plugin==1.1.1 # via -r requirements/main.in -motor==3.3.1 +motor==3.3.2 # via # -r requirements/main.in # beanie @@ -446,7 +459,7 @@ nbformat==5.9.2 # nbconvert nest-asyncio==1.5.8 # via ipykernel -nmdc-schema==9.1.0 +nmdc-schema==9.2.0 # via -r requirements/main.in notebook==7.0.6 # via jupyter @@ -455,7 +468,9 @@ notebook-shim==0.2.3 # jupyterlab # notebook numpy==1.26.2 - # via pandas + # via + # pandas + # terminusdb-client numpydoc==1.6.0 # via terminusdb-client openpyxl==3.1.2 @@ -474,18 +489,22 @@ packaging==23.2 # jupyterlab-server # mkdocs # nbconvert + # pyproject-api # pytest # qtconsole # qtpy # setuptools-scm # sphinx + # tox paginate==0.5.6 # via mkdocs-material pandas==2.1.3 - # via -r requirements/main.in + # via + # -r requirements/main.in + # terminusdb-client pandocfilters==1.5.0 # via nbconvert -parse==1.19.1 +parse==1.20.0 # via linkml parso==0.8.3 # via jedi @@ -497,7 +516,7 @@ pathspec==0.11.2 # mkdocs pendulum==2.1.2 # via dagster -pexpect==4.8.0 +pexpect==4.9.0 # via ipython platformdirs==4.0.0 # via @@ -505,25 +524,29 @@ platformdirs==4.0.0 # jupyter-core # mkdocs # requests-cache + # tox + # virtualenv pluggy==1.3.0 - # via pytest + # via + # pytest + # tox 
ply==3.11 # via jsonpath-ng prefixcommons==0.1.12 # via # linkml # linkml-runtime -prefixmaps==0.1.5 +prefixmaps==0.2.0 # via # linkml # linkml-runtime -prometheus-client==0.18.0 +prometheus-client==0.19.0 # via jupyter-server prompt-toolkit==3.0.41 # via # ipython # jupyter-console -protobuf==4.25.0 +protobuf==4.25.1 # via # dagster # grpcio-health-checking @@ -537,13 +560,13 @@ ptyprocess==0.7.0 # terminado pure-eval==0.2.2 # via stack-data -pyasn1==0.5.0 +pyasn1==0.5.1 # via # python-jose # rsa pycparser==2.21 # via cffi -pydantic==2.5.0 +pydantic==2.5.2 # via # -r requirements/main.in # beanie @@ -553,12 +576,13 @@ pydantic==2.5.0 # lazy-model # linkml # linkml-runtime + # prefixmaps # pydantic -pydantic-core==2.14.1 +pydantic-core==2.14.5 # via pydantic pyflakes==3.1.0 # via autoflake -pygments==2.16.1 +pygments==2.17.2 # via # ipython # jupyter-console @@ -572,16 +596,18 @@ pyjsg==0.11.10 # linkml # pyshexc # shexjsg -pymdown-extensions==10.4 +pymdown-extensions==10.5 # via # mkdocs-material # mkdocs-mermaid2-plugin -pymongo==4.6.0 +pymongo==4.6.1 # via # -r requirements/main.in # motor pyparsing==3.1.1 # via rdflib +pyproject-api==1.6.1 + # via tox pyshex==0.8.1 # via linkml pyshexc==0.9.1 @@ -650,7 +676,7 @@ pyzmq==25.1.1 # jupyter-console # jupyter-server # qtconsole -qtconsole==5.5.0 +qtconsole==5.5.1 # via jupyter qtpy==2.4.1 # via qtconsole @@ -670,7 +696,7 @@ rdflib-shim==1.0.3 # pyshex # pyshexc # sparqlslurper -referencing==0.31.0 +referencing==0.31.1 # via # jsonschema # jsonschema-specifications @@ -695,7 +721,7 @@ requests==2.31.0 # requests-toolbelt # sphinx # terminusdb-client -requests-cache==1.1.0 +requests-cache==1.1.1 # via -r requirements/main.in requests-toolbelt==0.10.1 # via gql @@ -709,7 +735,7 @@ rfc3986-validator==0.1.1 # jupyter-events rfc3987==1.3.8 # via jsonschema -rpds-py==0.12.0 +rpds-py==0.13.2 # via # jsonschema # referencing @@ -721,7 +747,7 @@ ruamel-yaml==0.18.5 # nmdc-schema ruamel-yaml-clib==0.2.8 # via ruamel-yaml -s3transfer==0.7.0 +s3transfer==0.8.2 # via boto3 semver==3.0.2 # via -r requirements/main.in @@ -803,7 +829,7 @@ terminado==0.18.0 # via # jupyter-server # jupyter-server-terminals -terminusdb-client==10.2.4 +terminusdb-client==10.2.6 # via -r requirements/main.in tinycss2==1.2.1 # via nbconvert @@ -814,12 +840,21 @@ toml==0.10.2 # beanie # jupytext tomli==2.0.1 - # via dagster + # via + # autoflake + # black + # dagster + # jupyterlab + # numpydoc + # pyproject-api + # pytest + # setuptools-scm + # tox toolz==0.12.0 # via -r requirements/main.in toposort==1.10 # via dagster -tornado==6.3.3 +tornado==6.4 # via # ipykernel # jupyter-client @@ -827,12 +862,14 @@ tornado==6.3.3 # jupyterlab # notebook # terminado +tox==4.11.4 + # via prefixmaps tqdm==4.66.1 # via # -r requirements/main.in # dagster # terminusdb-client -traitlets==5.13.0 +traitlets==5.14.0 # via # comm # ipykernel @@ -856,15 +893,19 @@ types-python-dateutil==2.8.19.14 typing-extensions==4.8.0 # via # alembic + # async-lru + # beanie + # black + # cattrs # dagster # fastapi # libcst - # prefixmaps # pydantic # pydantic-core # setuptools-scm # sqlalchemy # typing-inspect + # uvicorn typing-inspect==0.9.0 # via libcst tzdata==2023.3 @@ -888,6 +929,8 @@ uvicorn==0.24.0.post1 # dagster-webserver uvloop==0.19.0 # via uvicorn +virtualenv==20.24.7 + # via tox watchdog==3.0.0 # via # dagster @@ -895,7 +938,7 @@ watchdog==3.0.0 # mkdocs watchfiles==0.21.0 # via uvicorn -wcwidth==0.2.10 +wcwidth==0.2.12 # via prompt-toolkit webcolors==1.13 # via jsonschema @@ -915,13 +958,11 @@ 
xlrd==2.0.1 # via -r requirements/main.in xlsxwriter==3.1.9 # via -r requirements/main.in -yarl==1.9.2 +yarl==1.9.3 # via gql -zipp==3.17.0 - # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: -setuptools==68.2.2 +setuptools==69.0.2 # via # dagster # mkdocs-mermaid2-plugin