From 2068467732e902671974750bc4e7ab00d11fea5e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 1 Nov 2024 19:50:50 -0700 Subject: [PATCH 1/7] Make origin and transformer database names configurable (not hard coded) --- .../notebooks/.notebook.env.example | 2 ++ demo/metadata_migration/notebooks/helpers.py | 20 +++++++++++++++++++ .../notebooks/test_helpers.py | 8 +++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/demo/metadata_migration/notebooks/.notebook.env.example b/demo/metadata_migration/notebooks/.notebook.env.example index 187c7197..42633f43 100644 --- a/demo/metadata_migration/notebooks/.notebook.env.example +++ b/demo/metadata_migration/notebooks/.notebook.env.example @@ -12,9 +12,11 @@ ORIGIN_MONGO_HOST="__REPLACE_ME__" ORIGIN_MONGO_PORT="__REPLACE_ME__" ORIGIN_MONGO_USERNAME="__REPLACE_ME__" ORIGIN_MONGO_PASSWORD="__REPLACE_ME__" +ORIGIN_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc" # Connection parameters for the Transformer Mongo server (typically a local server). TRANSFORMER_MONGO_HOST="__REPLACE_ME__" TRANSFORMER_MONGO_PORT="__REPLACE_ME__" TRANSFORMER_MONGO_USERNAME="__REPLACE_ME__" TRANSFORMER_MONGO_PASSWORD="__REPLACE_ME__" +TRANSFORMER_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc_transformed" diff --git a/demo/metadata_migration/notebooks/helpers.py b/demo/metadata_migration/notebooks/helpers.py index f8ff9167..5eccfa89 100644 --- a/demo/metadata_migration/notebooks/helpers.py +++ b/demo/metadata_migration/notebooks/helpers.py @@ -78,11 +78,27 @@ def parse_and_validate_notebook_config_file( origin_mongo_port = notebook_config["ORIGIN_MONGO_PORT"] origin_mongo_username = notebook_config["ORIGIN_MONGO_USERNAME"] origin_mongo_password = notebook_config["ORIGIN_MONGO_PASSWORD"] + origin_mongo_database_name = notebook_config["ORIGIN_MONGO_DATABASE_NAME"] transformer_mongo_host = notebook_config["TRANSFORMER_MONGO_HOST"] transformer_mongo_port = notebook_config["TRANSFORMER_MONGO_PORT"] transformer_mongo_username = notebook_config["TRANSFORMER_MONGO_USERNAME"] transformer_mongo_password = notebook_config["TRANSFORMER_MONGO_PASSWORD"] + transformer_mongo_database_name = notebook_config["TRANSFORMER_MONGO_DATABASE_NAME"] + + # Validate the database names. + if origin_mongo_database_name.strip() == "": + raise ValueError(f"Origin database name cannot be empty") + if transformer_mongo_database_name.strip() == "": + raise ValueError(f"Transformer database name cannot be empty") + if all([ + origin_mongo_host == transformer_mongo_host, + origin_mongo_port == transformer_mongo_port, + origin_mongo_database_name == transformer_mongo_database_name, + ]): + # Note: We don't allow the use of the origin database as the transformer, + # because that would prevent us from easily aborting the migration. + raise ValueError(f"The origin and transformer cannot both be the same database") return dict( origin_dump_folder_path=origin_dump_folder_path, @@ -94,10 +110,12 @@ def parse_and_validate_notebook_config_file( origin_mongo_port=origin_mongo_port, origin_mongo_username=origin_mongo_username, origin_mongo_password=origin_mongo_password, + origin_mongo_database_name=origin_mongo_database_name, transformer_mongo_host=transformer_mongo_host, transformer_mongo_port=transformer_mongo_port, transformer_mongo_username=transformer_mongo_username, transformer_mongo_password=transformer_mongo_password, + transformer_mongo_database_name=transformer_mongo_database_name, ) def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None: @@ -114,10 +132,12 @@ def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None: self.origin_mongo_port = notebook_config["origin_mongo_port"] self.origin_mongo_username = notebook_config["origin_mongo_username"] self.origin_mongo_password = notebook_config["origin_mongo_password"] + self.origin_mongo_database_name = notebook_config["origin_mongo_database_name"] self.transformer_mongo_host = notebook_config["transformer_mongo_host"] self.transformer_mongo_port = notebook_config["transformer_mongo_port"] self.transformer_mongo_username = notebook_config["transformer_mongo_username"] self.transformer_mongo_password = notebook_config["transformer_mongo_password"] + self.transformer_mongo_database_name = notebook_config["transformer_mongo_database_name"] def setup_logger( diff --git a/demo/metadata_migration/notebooks/test_helpers.py b/demo/metadata_migration/notebooks/test_helpers.py index 05b3617c..20cf0408 100644 --- a/demo/metadata_migration/notebooks/test_helpers.py +++ b/demo/metadata_migration/notebooks/test_helpers.py @@ -71,10 +71,12 @@ def test_init_method(self): origin_mongo_port = "11111" origin_mongo_username = "origin_username" origin_mongo_password = "origin_password" + origin_mongo_database_name = "origin_database_name" transformer_mongo_host = "transformer" transformer_mongo_port = "22222" transformer_mongo_username = "transformer_username" - transformer_mongo_password = "transformer_password" + transformer_mongo_password = "transformer_password" + transformer_mongo_database_name = "transformer_database_name" origin_mongo_yaml = f"uri: {origin_mongo_server_uri}\n" transformer_mongo_yaml = f"uri: {transformer_mongo_server_uri}\n" origin_mongo_config_file.write(origin_mongo_yaml.encode("utf-8")) @@ -100,10 +102,12 @@ def test_init_method(self): ORIGIN_MONGO_PORT=origin_mongo_port, ORIGIN_MONGO_USERNAME=origin_mongo_username, ORIGIN_MONGO_PASSWORD=origin_mongo_password, + ORIGIN_MONGO_DATABASE_NAME=origin_mongo_database_name, TRANSFORMER_MONGO_HOST=transformer_mongo_host, TRANSFORMER_MONGO_PORT=transformer_mongo_port, TRANSFORMER_MONGO_USERNAME=transformer_mongo_username, TRANSFORMER_MONGO_PASSWORD=transformer_mongo_password, + TRANSFORMER_MONGO_DATABASE_NAME=transformer_mongo_database_name, ) for key, value in notebook_config_values.items(): notebook_config_file.write(f"{key} = {value}\n".encode("utf-8")) @@ -121,10 +125,12 @@ def test_init_method(self): assert cfg.origin_mongo_port == origin_mongo_port assert cfg.origin_mongo_username == origin_mongo_username assert cfg.origin_mongo_password == origin_mongo_password + assert cfg.origin_mongo_database_name == origin_mongo_database_name assert cfg.transformer_mongo_host == transformer_mongo_host assert cfg.transformer_mongo_port == transformer_mongo_port assert cfg.transformer_mongo_username == transformer_mongo_username assert cfg.transformer_mongo_password == transformer_mongo_password + assert cfg.transformer_mongo_database_name == transformer_mongo_database_name # Delete the temporary directories (i.e. clean up). shutil.rmtree(origin_dump_folder_path) From f1104d96351dd2b9266b420913eb827fa2ee3c61 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 1 Nov 2024 19:56:36 -0700 Subject: [PATCH 2/7] Delete obsolete variables from test --- .../notebooks/test_helpers.py | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/demo/metadata_migration/notebooks/test_helpers.py b/demo/metadata_migration/notebooks/test_helpers.py index 20cf0408..a0926b27 100644 --- a/demo/metadata_migration/notebooks/test_helpers.py +++ b/demo/metadata_migration/notebooks/test_helpers.py @@ -53,43 +53,33 @@ def test_make_mongo_cli_base_options(self): """.strip() def test_init_method(self): - with (TempFile() as notebook_config_file, - TempFile() as origin_mongo_config_file, - TempFile() as transformer_mongo_config_file, - TempFile() as mongodump_binary, - TempFile() as mongorestore_binary, + with (TempFile() as notebook_config_file, + TempFile() as mongodump_binary, + TempFile() as mongorestore_binary, TempFile() as mongosh_binary): # Create named temporary directories and get their paths. origin_dump_folder_path = mkdtemp() transformer_dump_folder_path = mkdtemp() - # Populate the Mongo config files, then reset their file pointers. - origin_mongo_server_uri = f"mongodb://u:p@origin:12345" - transformer_mongo_server_uri = f"mongodb://u:p@transformer:12345" + # Define Mongo server connection parameters. origin_mongo_host = "origin" origin_mongo_port = "11111" origin_mongo_username = "origin_username" origin_mongo_password = "origin_password" - origin_mongo_database_name = "origin_database_name" transformer_mongo_host = "transformer" transformer_mongo_port = "22222" transformer_mongo_username = "transformer_username" transformer_mongo_password = "transformer_password" + + # Define Mongo database selection parameters. + origin_mongo_database_name = "origin_database_name" transformer_mongo_database_name = "transformer_database_name" - origin_mongo_yaml = f"uri: {origin_mongo_server_uri}\n" - transformer_mongo_yaml = f"uri: {transformer_mongo_server_uri}\n" - origin_mongo_config_file.write(origin_mongo_yaml.encode("utf-8")) - transformer_mongo_config_file.write(transformer_mongo_yaml.encode("utf-8")) - origin_mongo_config_file.seek(0) - transformer_mongo_config_file.seek(0) # Use familiar aliases in an attempt to facilitate writing the `assert` section below. mongodump_path = mongodump_binary.name mongorestore_path = mongorestore_binary.name mongosh_path = mongosh_binary.name - origin_mongo_config_file_path = origin_mongo_config_file.name - transformer_mongo_config_file_path = transformer_mongo_config_file.name # Populate the notebook config file, then reset its file pointer. notebook_config_values = dict( From 08d9f33196fe27c3c434e66036bae5bc215b4b79 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 1 Nov 2024 20:37:57 -0700 Subject: [PATCH 3/7] Update notebook to use database names defined in configuration file --- .../notebooks/migrate_10_9_1_to_11_0_0.ipynb | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb index b50d3494..c74da0b3 100644 --- a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb @@ -52,7 +52,7 @@ "\n", "Here, you'll prepare an environment for running this notebook.\n", "\n", - "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database named `nmdc`).\n", + "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database having the name specified in the notebook configuration file).\n", " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command. A MongoDB server started this way will be accessible without a username or password.\n" ] }, @@ -241,13 +241,13 @@ " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", "\n", " # Sanity test: Ensure the origin database exists.\n", - " assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + " assert cfg.origin_mongo_database_name in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", "\n", " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", "\n", " # Sanity test: Ensure the transformation database does not exist.\n", - " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + " assert cfg.transformer_mongo_database_name not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" ], "outputs": [], "execution_count": null @@ -257,7 +257,7 @@ "id": "1e195db1", "metadata": {}, "source": [ - "Delete the \"nmdc\" database from the transformer MongoDB server if that database already exists there (e.g. if it was left over from an experiment).\n", + "Delete the transformer database from the transformer MongoDB server if that database already exists there (e.g. if it was left over from an experiment).\n", "\n", "##### Description\n", "\n", @@ -277,7 +277,7 @@ "shell_command = f\"\"\"\n", " {cfg.mongosh_path} {transformer_mongo_cli_base_options} \\\n", " --quiet \\\n", - " --eval 'use nmdc' \\\n", + " --eval 'use {cfg.transformer_mongo_database_name}' \\\n", " --eval 'db.dropDatabase()'\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", @@ -428,7 +428,7 @@ "# Dump all collections from the \"origin\" database.\n", "shell_command = f\"\"\"\n", " {mongodump} {origin_mongo_cli_base_options} \\\n", - " --db='nmdc' \\\n", + " --db='{cfg.origin_mongo_database_name}' \\\n", " --gzip \\\n", " --out='{cfg.origin_dump_folder_path}'\n", "\"\"\"\n", @@ -445,7 +445,11 @@ "source": [ "### Load the dumped collections into the \"transformer\" MongoDB server\n", "\n", - "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server." + "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server.\n", + "\n", + "References:\n", + "- https://www.mongodb.com/docs/database-tools/mongorestore/#std-option-mongorestore\n", + "- https://www.mongodb.com/docs/database-tools/mongorestore/mongorestore-examples/#copy-clone-a-database" ] }, { @@ -462,6 +466,8 @@ " --drop \\\n", " --preserveUUID \\\n", " --stopOnError \\\n", + " --nsFrom='{cfg.origin_mongo_database_name}.*' \\\n", + " --nsTo='{cfg.transformer_mongo_database_name}.*' \\\n", " --dir='{cfg.origin_dump_folder_path}'\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", @@ -493,7 +499,7 @@ "source": [ "# Instantiate a MongoAdapter bound to the \"transformer\" database.\n", "adapter = MongoAdapter(\n", - " database=transformer_mongo_client[\"nmdc\"],\n", + " database=transformer_mongo_client[cfg.transformer_mongo_database_name],\n", " on_collection_created=lambda name: print(f'Created collection \"{name}\"'),\n", " on_collection_renamed=lambda old_name, name: print(f'Renamed collection \"{old_name}\" to \"{name}\"'),\n", " on_collection_deleted=lambda name: print(f'Deleted collection \"{name}\"'),\n", @@ -537,7 +543,7 @@ " ordered_collection_names.append(large_collection_name) # puts it last\n", "\n", "for collection_name in ordered_collection_names:\n", - " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " collection = transformer_mongo_client[cfg.transformer_mongo_database_name][collection_name]\n", " num_documents_in_collection = collection.count_documents({})\n", " print(f\"Validating collection {collection_name} ({num_documents_in_collection} documents)\", end=\"\\t\") # no newline\n", "\n", @@ -593,7 +599,7 @@ "# Dump the database from the \"transformer\" MongoDB server.\n", "shell_command = f\"\"\"\n", " {mongodump} {transformer_mongo_cli_base_options} \\\n", - " --db='nmdc' \\\n", + " --db='{cfg.transformer_mongo_database_name}' \\\n", " --gzip \\\n", " --out='{cfg.transformer_dump_folder_path}'\n", "\"\"\"\n", @@ -655,7 +661,7 @@ "source": [ "### Drop the original collections from the \"origin\" MongoDB server\n", "\n", - "This is necessary for situations where collections were renamed or deleted. (The `--drop` option of `mongorestore` only drops collections that exist in the dump.)" + "This is necessary for situations where collections were renamed or deleted. (The `--drop` option of `mongorestore` would only drop collections that exist in the dump being restored, which would not include renamed or deleted collections.)" ] }, { @@ -665,7 +671,7 @@ "source": [ "shell_command = f\"\"\"\n", " {cfg.mongosh_path} {origin_mongo_cli_base_options} \\\n", - " --eval 'use nmdc' \\\n", + " --eval 'use {cfg.origin_mongo_database_name}' \\\n", " --eval 'db.dropDatabase()'\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", @@ -699,6 +705,8 @@ " --dir='{cfg.transformer_dump_folder_path}' \\\n", " --drop \\\n", " --preserveUUID \\\n", + " --nsFrom='{cfg.transformer_mongo_database_name}.*' \\\n", + " --nsTo='{cfg.origin_mongo_database_name}.*' \\\n", " --stopOnError\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", From dedfe764cc5c1dbfd2bc70b01210895170aff8f6 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 1 Nov 2024 20:42:20 -0700 Subject: [PATCH 4/7] Reorder CLI options to make them easier to read --- .../notebooks/migrate_10_9_1_to_11_0_0.ipynb | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb index c74da0b3..7803434b 100644 --- a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb @@ -276,9 +276,9 @@ "# because I expect to eventually use regular Python scripts—not Python notebooks—for migrations.\n", "shell_command = f\"\"\"\n", " {cfg.mongosh_path} {transformer_mongo_cli_base_options} \\\n", - " --quiet \\\n", " --eval 'use {cfg.transformer_mongo_database_name}' \\\n", - " --eval 'db.dropDatabase()'\n", + " --eval 'db.dropDatabase()' \\\n", + " --quiet\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" @@ -376,8 +376,8 @@ "source": [ "shell_command = f\"\"\"\n", " {cfg.mongosh_path} {origin_mongo_cli_base_options} \\\n", - " --quiet \\\n", - " --file='mongosh-scripts/revoke-privileges.mongo.js'\n", + " --file='mongosh-scripts/revoke-privileges.mongo.js' \\\n", + " --quiet\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" @@ -429,8 +429,8 @@ "shell_command = f\"\"\"\n", " {mongodump} {origin_mongo_cli_base_options} \\\n", " --db='{cfg.origin_mongo_database_name}' \\\n", - " --gzip \\\n", - " --out='{cfg.origin_dump_folder_path}'\n", + " --out='{cfg.origin_dump_folder_path}' \\\n", + " --gzip\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" @@ -462,13 +462,13 @@ "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", "shell_command = f\"\"\"\n", " {mongorestore} {transformer_mongo_cli_base_options} \\\n", - " --gzip \\\n", - " --drop \\\n", - " --preserveUUID \\\n", - " --stopOnError \\\n", " --nsFrom='{cfg.origin_mongo_database_name}.*' \\\n", " --nsTo='{cfg.transformer_mongo_database_name}.*' \\\n", - " --dir='{cfg.origin_dump_folder_path}'\n", + " --dir='{cfg.origin_dump_folder_path}' \\\n", + " --preserveUUID \\\n", + " --stopOnError \\\n", + " --drop \\\n", + " --gzip\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" @@ -600,8 +600,8 @@ "shell_command = f\"\"\"\n", " {mongodump} {transformer_mongo_cli_base_options} \\\n", " --db='{cfg.transformer_mongo_database_name}' \\\n", - " --gzip \\\n", - " --out='{cfg.transformer_dump_folder_path}'\n", + " --out='{cfg.transformer_dump_folder_path}' \\\n", + " --gzip\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\") " @@ -700,14 +700,14 @@ "# Load the transformed collections into the origin server, replacing any same-named ones that are there.\n", "shell_command = f\"\"\"\n", " {mongorestore} {origin_mongo_cli_base_options} \\\n", - " --gzip \\\n", - " --verbose \\\n", - " --dir='{cfg.transformer_dump_folder_path}' \\\n", - " --drop \\\n", - " --preserveUUID \\\n", " --nsFrom='{cfg.transformer_mongo_database_name}.*' \\\n", " --nsTo='{cfg.origin_mongo_database_name}.*' \\\n", - " --stopOnError\n", + " --dir='{cfg.transformer_dump_folder_path}' \\\n", + " --preserveUUID \\\n", + " --stopOnError \\\n", + " --verbose \\\n", + " --drop \\\n", + " --gzip\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\") " @@ -762,8 +762,8 @@ "source": [ "shell_command = f\"\"\"\n", " {cfg.mongosh_path} {origin_mongo_cli_base_options} \\\n", - " --quiet \\\n", - " --file='mongosh-scripts/restore-privileges.mongo.js'\n", + " --file='mongosh-scripts/restore-privileges.mongo.js' \\\n", + " --quiet\n", "\"\"\"\n", "completed_process = subprocess.run(shell_command, shell=True)\n", "print(f\"\\nReturn code: {completed_process.returncode}\")" From 0ecb288909df4e6fc48aeaed4e603d1be47ac257 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 1 Nov 2024 21:57:02 -0700 Subject: [PATCH 5/7] Use LinkML validator instead of JSON Schema validator for validation --- demo/metadata_migration/notebooks/helpers.py | 45 +++++++++++++++++ .../notebooks/migrate_10_9_1_to_11_0_0.ipynb | 49 ++++++++----------- .../notebooks/requirements.txt | 2 +- 3 files changed, 67 insertions(+), 29 deletions(-) diff --git a/demo/metadata_migration/notebooks/helpers.py b/demo/metadata_migration/notebooks/helpers.py index 5eccfa89..a72b6abe 100644 --- a/demo/metadata_migration/notebooks/helpers.py +++ b/demo/metadata_migration/notebooks/helpers.py @@ -2,6 +2,7 @@ from typing import Dict, Optional, List import logging from datetime import datetime +from functools import cache from dotenv import dotenv_values from linkml_runtime import SchemaView @@ -174,6 +175,9 @@ def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]: Returns the names of the slots of the `Database` class that describe database collections. :param schema_view: A `SchemaView` instance + + Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py + with permission from its author. """ collection_names = [] @@ -190,3 +194,44 @@ def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]: collection_names = list(set(collection_names)) return collection_names + + +@cache # memoizes the decorated function +def translate_class_uri_into_schema_class_name(schema_view: SchemaView, class_uri: str) -> Optional[str]: + r""" + Returns the name of the schema class that has the specified value as its `class_uri`. + + Example: "nmdc:Biosample" (a `class_uri` value) -> "Biosample" (a class name) + + References: + - https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.all_classes + - https://linkml.io/linkml/code/metamodel.html#linkml_runtime.linkml_model.meta.ClassDefinition.class_uri + + Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py + with permission from its author. + """ + schema_class_name = None + all_class_definitions_in_schema = schema_view.all_classes() + for class_name, class_definition in all_class_definitions_in_schema.items(): + if class_definition.class_uri == class_uri: + schema_class_name = class_definition.name + break + return schema_class_name + + +def derive_schema_class_name_from_document(schema_view: SchemaView, document: dict) -> Optional[str]: + r""" + Returns the name of the schema class, if any, of which the specified document claims to represent an instance. + + This function is written under the assumption that the document has a `type` field whose value is the `class_uri` + belonging to the schema class of which the document represents an instance. Slot definition for such a field: + https://github.com/microbiomedata/berkeley-schema-fy24/blob/fc2d9600/src/schema/basic_slots.yaml#L420-L436 + + Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py + with permission from its author. + """ + schema_class_name = None + if "type" in document and isinstance(document["type"], str): + class_uri = document["type"] + schema_class_name = translate_class_uri_into_schema_class_name(schema_view, class_uri) + return schema_class_name diff --git a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb index 7803434b..870bdd1e 100644 --- a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb @@ -60,9 +60,7 @@ "cell_type": "code", "id": "8aee55e3", "metadata": {}, - "source": [ - "!docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4" - ], + "source": "!docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4", "outputs": [], "execution_count": null }, @@ -136,9 +134,10 @@ ] }, { - "cell_type": "code", - "id": "dbecd561", "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ "# Standard library packages:\n", "import subprocess\n", @@ -146,8 +145,8 @@ "\n", "# Third-party packages:\n", "import pymongo\n", - "from jsonschema import Draft7Validator as JSONSchemaValidator\n", - "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict, SchemaVariantIdentifier, get_nmdc_schema_definition\n", + "from linkml.validator import Validator, ValidationReport\n", + "from nmdc_schema.nmdc_data import get_nmdc_schema_definition\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", "from linkml_runtime import SchemaView\n", "\n", @@ -156,11 +155,10 @@ "from nmdc_schema.migrators.migrator_from_10_2_0_to_11_0_0 import Migrator\n", "\n", "# First-party packages:\n", - "from helpers import Config, setup_logger, get_collection_names_from_schema\n", + "from helpers import Config, setup_logger, get_collection_names_from_schema, derive_schema_class_name_from_document\n", "from bookkeeper import Bookkeeper, MigrationEvent" ], - "outputs": [], - "execution_count": null + "id": "dbecd561" }, { "cell_type": "markdown", @@ -296,9 +294,12 @@ } }, "source": [ - "### Create JSON Schema validator\n", + "### Create validator\n", + "\n", + "In this step, you'll create a validator that can be used to check whether data conforms to the NMDC Schema. You'll use it later, to do that.\n", "\n", - "In this step, you'll create a JSON Schema validator for the NMDC Schema." + "References:\n", + "- https://linkml.io/linkml/code/validator.html#linkml.validator.Validator" ] }, { @@ -311,15 +312,11 @@ } }, "source": [ - "nmdc_jsonschema: dict = get_nmdc_jsonschema_dict(variant=SchemaVariantIdentifier.nmdc_materialized_patterns)\n", - "nmdc_jsonschema_validator = JSONSchemaValidator(nmdc_jsonschema)\n", + "schema_definition = get_nmdc_schema_definition()\n", + "validator = Validator(schema=schema_definition)\n", "\n", - "# Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", - "# Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema\n", - "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", - "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", - "\n", - "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + "# Perform a sanity test of the validator.\n", + "assert callable(validator.validate), \"Failed to instantiate a validator\"" ], "outputs": [], "execution_count": null @@ -558,15 +555,11 @@ " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", " # copy (i.e. a shallow copy) of the document that lacks that specific key.\n", " #\n", - " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", - " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", - " #\n", - " # TODO: Use `linkml.validator` — instead of `jsonschema` — to validate the document.\n", - " # Reference: https://linkml.io/linkml/data/validating-data.html\n", - " #\n", + " schema_class_name = derive_schema_class_name_from_document(schema_view=schema_view, document=document)\n", " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", - " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", - " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + " validation_report: ValidationReport = validator.validate(document_without_underscore_id_key, schema_class_name)\n", + " if len(validation_report.results) > 0:\n", + " raise TypeError(validation_report)\n", "\n", " # Show a \"sign of life\" each time we validate an additional 10% of the documents in the collection.\n", " # Note: The `//` operator performs \"floor division,\" returning the largest integer <= the real quotient.\n", diff --git a/demo/metadata_migration/notebooks/requirements.txt b/demo/metadata_migration/notebooks/requirements.txt index 63794fcc..4d95feb1 100644 --- a/demo/metadata_migration/notebooks/requirements.txt +++ b/demo/metadata_migration/notebooks/requirements.txt @@ -1,5 +1,5 @@ dictdiffer==0.9.0 -jsonschema==4.19.2 +linkml==1.8.5 pymongo==4.7.2 python-dotenv==1.0.0 PyYAML==6.0.1 From 5bd3b1ac0a534d65a4b5245e5799c1c5ba54aa3e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 1 Nov 2024 23:15:42 -0700 Subject: [PATCH 6/7] Fix typo in comment --- demo/metadata_migration/notebooks/.notebook.env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/metadata_migration/notebooks/.notebook.env.example b/demo/metadata_migration/notebooks/.notebook.env.example index 42633f43..066392b6 100644 --- a/demo/metadata_migration/notebooks/.notebook.env.example +++ b/demo/metadata_migration/notebooks/.notebook.env.example @@ -19,4 +19,4 @@ TRANSFORMER_MONGO_HOST="__REPLACE_ME__" TRANSFORMER_MONGO_PORT="__REPLACE_ME__" TRANSFORMER_MONGO_USERNAME="__REPLACE_ME__" TRANSFORMER_MONGO_PASSWORD="__REPLACE_ME__" -TRANSFORMER_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc_transformed" +TRANSFORMER_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc_transformer" From 4844c4b61bcebe1be0b688777e15cc7f62f550d8 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sat, 2 Nov 2024 00:00:49 -0700 Subject: [PATCH 7/7] Remove `--preserveUUID` CLI option from mongorestore commands That's because it cannot be used when restoring into the same server as that on which the dumped database resides. --- .../notebooks/migrate_10_9_1_to_11_0_0.ipynb | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb index 870bdd1e..bd3f026e 100644 --- a/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_9_1_to_11_0_0.ipynb @@ -60,7 +60,9 @@ "cell_type": "code", "id": "8aee55e3", "metadata": {}, - "source": "!docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4", + "source": [ + "!docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4" + ], "outputs": [], "execution_count": null }, @@ -134,10 +136,9 @@ ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], - "execution_count": null, + "id": "dbecd561", + "metadata": {}, "source": [ "# Standard library packages:\n", "import subprocess\n", @@ -146,6 +147,7 @@ "# Third-party packages:\n", "import pymongo\n", "from linkml.validator import Validator, ValidationReport\n", + "from linkml.validator.plugins import JsonschemaValidationPlugin\n", "from nmdc_schema.nmdc_data import get_nmdc_schema_definition\n", "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", "from linkml_runtime import SchemaView\n", @@ -158,7 +160,8 @@ "from helpers import Config, setup_logger, get_collection_names_from_schema, derive_schema_class_name_from_document\n", "from bookkeeper import Bookkeeper, MigrationEvent" ], - "id": "dbecd561" + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -299,7 +302,8 @@ "In this step, you'll create a validator that can be used to check whether data conforms to the NMDC Schema. You'll use it later, to do that.\n", "\n", "References:\n", - "- https://linkml.io/linkml/code/validator.html#linkml.validator.Validator" + "- https://linkml.io/linkml/code/validator.html#linkml.validator.Validator\n", + "- https://linkml.io/linkml/data/validating-data.html#validation-in-python-code" ] }, { @@ -313,7 +317,10 @@ }, "source": [ "schema_definition = get_nmdc_schema_definition()\n", - "validator = Validator(schema=schema_definition)\n", + "validator = Validator(\n", + " schema=schema_definition,\n", + " validation_plugins=[JsonschemaValidationPlugin(closed=True)],\n", + ")\n", "\n", "# Perform a sanity test of the validator.\n", "assert callable(validator.validate), \"Failed to instantiate a validator\"" @@ -462,7 +469,6 @@ " --nsFrom='{cfg.origin_mongo_database_name}.*' \\\n", " --nsTo='{cfg.transformer_mongo_database_name}.*' \\\n", " --dir='{cfg.origin_dump_folder_path}' \\\n", - " --preserveUUID \\\n", " --stopOnError \\\n", " --drop \\\n", " --gzip\n", @@ -542,7 +548,7 @@ "for collection_name in ordered_collection_names:\n", " collection = transformer_mongo_client[cfg.transformer_mongo_database_name][collection_name]\n", " num_documents_in_collection = collection.count_documents({})\n", - " print(f\"Validating collection {collection_name} ({num_documents_in_collection} documents)\", end=\"\\t\") # no newline\n", + " print(f\"Validating collection {collection_name} ({num_documents_in_collection} documents)\", end=\" \") # no newline\n", "\n", " num_documents_in_collection_validated = 0\n", " for document in collection.find():\n", @@ -567,7 +573,7 @@ " if num_documents_in_collection_validated % (num_documents_in_collection // 10) == 0:\n", " print(\".\", end=\"\") # no newline\n", " \n", - " print(f\"Done\")" + " print(f\" Done\")" ], "outputs": [], "execution_count": null @@ -696,7 +702,6 @@ " --nsFrom='{cfg.transformer_mongo_database_name}.*' \\\n", " --nsTo='{cfg.origin_mongo_database_name}.*' \\\n", " --dir='{cfg.transformer_dump_folder_path}' \\\n", - " --preserveUUID \\\n", " --stopOnError \\\n", " --verbose \\\n", " --drop \\\n", @@ -767,7 +772,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" },