Merge pull request #553 from microbiomedata/519-migrations-implement-notebook-that-runs-all-berkeley-schema-migrators
eecavanna authored Aug 28, 2024
2 parents c2a7802 + e832cf1 commit 7955875
Showing 10 changed files with 1,050 additions and 114 deletions.
5 changes: 2 additions & 3 deletions demo/metadata_migration/notebooks/.gitignore
@@ -1,5 +1,4 @@
/.notebook.env
/.mongo.origin.yaml
/.mongo.transformer.yaml
/mongodump.*.out
/tmp.*
/tmp.*
/*_migration.log
29 changes: 0 additions & 29 deletions demo/metadata_migration/notebooks/.mongo.yaml.example

This file was deleted.

21 changes: 15 additions & 6 deletions demo/metadata_migration/notebooks/.notebook.env.example
@@ -1,11 +1,20 @@
# Paths to Mongo config files.
PATH_TO_ORIGIN_MONGO_CONFIG_FILE = "./.mongo.origin.yaml"
PATH_TO_TRANSFORMER_MONGO_CONFIG_FILE = "./.mongo.transformer.yaml"

# Paths to folders in which the notebook will store Mongo dumps.
PATH_TO_ORIGIN_MONGO_DUMP_FOLDER = "./mongodump.origin.out"
PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER = "./mongodump.transformer.out"

# These are absolute paths to the `mongodump` and `mongorestore` programs.
PATH_TO_MONGODUMP_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongodb-database-tools-macos-arm64-100.7.4/bin/mongodump"
# These are absolute paths to the `mongodump`, `mongorestore`, and `mongosh` programs.
PATH_TO_MONGODUMP_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongodb-database-tools-macos-arm64-100.7.4/bin/mongodump"
PATH_TO_MONGORESTORE_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongodb-database-tools-macos-arm64-100.7.4/bin/mongorestore"
PATH_TO_MONGOSH_BINARY = "__REPLACE_ME__" # e.g. "/Users/Alice/Downloads/mongosh-1.10.6-darwin-x64/bin/mongosh"

# Connection parameters for the Origin Mongo server (typically a remote server).
ORIGIN_MONGO_HOST="__REPLACE_ME__"
ORIGIN_MONGO_PORT="__REPLACE_ME__"
ORIGIN_MONGO_USERNAME="__REPLACE_ME__"
ORIGIN_MONGO_PASSWORD="__REPLACE_ME__"

# Connection parameters for the Transformer Mongo server (typically a local server).
TRANSFORMER_MONGO_HOST="__REPLACE_ME__"
TRANSFORMER_MONGO_PORT="__REPLACE_ME__"
TRANSFORMER_MONGO_USERNAME="__REPLACE_ME__"
TRANSFORMER_MONGO_PASSWORD="__REPLACE_ME__"
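
For orientation, the values above are read by the migration notebooks via `dotenv_values` (see `helpers.py` below). The following is a minimal sketch of how the new connection parameters and `PATH_TO_MONGODUMP_BINARY` could be combined into a `mongodump` invocation; the specific command-line flags are illustrative assumptions, not code taken from this pull request.

# Sketch only: build and run a `mongodump` command from the .notebook.env values.
# The flag choices below are illustrative assumptions, not the notebook's actual code.
import subprocess

from dotenv import dotenv_values

cfg = dotenv_values("./.notebook.env")

dump_command = [
    cfg["PATH_TO_MONGODUMP_BINARY"],
    f"--host={cfg['ORIGIN_MONGO_HOST']}",
    f"--port={cfg['ORIGIN_MONGO_PORT']}",
    f"--username={cfg['ORIGIN_MONGO_USERNAME']}",
    f"--password={cfg['ORIGIN_MONGO_PASSWORD']}",
    f"--out={cfg['PATH_TO_ORIGIN_MONGO_DUMP_FOLDER']}",
]

subprocess.run(dump_command, check=True)  # raises CalledProcessError if the dump fails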
161 changes: 96 additions & 65 deletions demo/metadata_migration/notebooks/helpers.py
@@ -1,9 +1,13 @@
from pathlib import Path
import re
from typing import Dict
from typing import Dict, Optional, List
import logging
from datetime import datetime

from dotenv import dotenv_values
import yaml
from linkml_runtime import SchemaView


DATABASE_CLASS_NAME = "Database"


class Config:
@@ -21,27 +25,9 @@ def parse_and_validate_notebook_config_file(
# Parse the notebook config file.
notebook_config = dotenv_values(notebook_config_file_path)

# Validate the Mongo config file paths.
origin_mongo_config_file_path = notebook_config[
"PATH_TO_ORIGIN_MONGO_CONFIG_FILE"
]
transformer_mongo_config_file_path = notebook_config[
"PATH_TO_TRANSFORMER_MONGO_CONFIG_FILE"
]
if not Path(origin_mongo_config_file_path).is_file():
raise FileNotFoundError(
f"Origin Mongo config file not found at: {origin_mongo_config_file_path}"
)
if not Path(transformer_mongo_config_file_path).is_file():
raise FileNotFoundError(
f"Transformer Mongo config file not found at: {transformer_mongo_config_file_path}"
)

# Validate the dump folder paths.
origin_dump_folder_path = notebook_config["PATH_TO_ORIGIN_MONGO_DUMP_FOLDER"]
transformer_dump_folder_path = notebook_config[
"PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER"
]
transformer_dump_folder_path = notebook_config["PATH_TO_TRANSFORMER_MONGO_DUMP_FOLDER"]
if not Path(origin_dump_folder_path).parent.is_dir():
raise FileNotFoundError(
f"Parent folder of {origin_dump_folder_path} (origin Mongo dump folder path) not found."
@@ -54,62 +40,107 @@
# Validate the binary paths.
mongodump_path = notebook_config["PATH_TO_MONGODUMP_BINARY"]
mongorestore_path = notebook_config["PATH_TO_MONGORESTORE_BINARY"]
mongosh_path = notebook_config["PATH_TO_MONGOSH_BINARY"]
if not Path(mongodump_path).is_file():
raise FileNotFoundError(f"mongodump binary not found at: {mongodump_path}")
if not Path(mongorestore_path).is_file():
raise FileNotFoundError(
f"mongorestore binary not found at: {mongorestore_path}"
)
raise FileNotFoundError(f"mongorestore binary not found at: {mongorestore_path}")
if not Path(mongosh_path).is_file():
raise FileNotFoundError(f"mongosh binary not found at: {mongosh_path}")

origin_mongo_host = notebook_config["ORIGIN_MONGO_HOST"]
origin_mongo_port = notebook_config["ORIGIN_MONGO_PORT"]
origin_mongo_username = notebook_config["ORIGIN_MONGO_USERNAME"]
origin_mongo_password = notebook_config["ORIGIN_MONGO_PASSWORD"]

transformer_mongo_host = notebook_config["TRANSFORMER_MONGO_HOST"]
transformer_mongo_port = notebook_config["TRANSFORMER_MONGO_PORT"]
transformer_mongo_username = notebook_config["TRANSFORMER_MONGO_USERNAME"]
transformer_mongo_password = notebook_config["TRANSFORMER_MONGO_PASSWORD"]

return dict(
origin_mongo_config_file_path=origin_mongo_config_file_path,
transformer_mongo_config_file_path=transformer_mongo_config_file_path,
origin_dump_folder_path=origin_dump_folder_path,
transformer_dump_folder_path=transformer_dump_folder_path,
mongodump_path=mongodump_path,
mongorestore_path=mongorestore_path,
mongosh_path=mongosh_path,
origin_mongo_host=origin_mongo_host,
origin_mongo_port=origin_mongo_port,
origin_mongo_username=origin_mongo_username,
origin_mongo_password=origin_mongo_password,
transformer_mongo_host=transformer_mongo_host,
transformer_mongo_port=transformer_mongo_port,
transformer_mongo_username=transformer_mongo_username,
transformer_mongo_password=transformer_mongo_password,
)

def parse_and_validate_mongo_config_file(
self, mongo_config_file_path: str
) -> Dict[str, str]:
# Parse the Mongo config files as YAML.
with open(mongo_config_file_path, "r") as file:
mongo_config = yaml.safe_load(file)

# Validate the connection string.
uri = mongo_config["uri"]
if not re.match(
r"^mongodb:\/\/.*", uri
): # note: this is a sanity test, not a comprehensive test
raise ValueError(f"uri value in {mongo_config_file_path} is invalid.")

return dict(uri=uri)

def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None:
# Parse and validate the notebook config file.
notebook_config = self.parse_and_validate_notebook_config_file(
notebook_config_file_path
)
notebook_config = self.parse_and_validate_notebook_config_file(notebook_config_file_path)
self.mongodump_path = notebook_config["mongodump_path"]
self.mongorestore_path = notebook_config["mongorestore_path"]
self.mongosh_path = notebook_config["mongosh_path"]
self.origin_dump_folder_path = notebook_config["origin_dump_folder_path"]
self.transformer_dump_folder_path = notebook_config[
"transformer_dump_folder_path"
]

# Parse and validate the Mongo config files.
self.origin_mongo_config_file_path = notebook_config[
"origin_mongo_config_file_path"
]
self.transformer_mongo_config_file_path = notebook_config[
"transformer_mongo_config_file_path"
]
origin_mongo_server_config = self.parse_and_validate_mongo_config_file(
self.origin_mongo_config_file_path
)
transformer_mongo_server_config = self.parse_and_validate_mongo_config_file(
self.transformer_mongo_config_file_path
)
self.origin_mongo_server_uri = origin_mongo_server_config["uri"]
self.transformer_mongo_server_uri = transformer_mongo_server_config["uri"]
self.transformer_dump_folder_path = notebook_config["transformer_dump_folder_path"]

# Parse the Mongo connection parameters.
self.origin_mongo_host = notebook_config["origin_mongo_host"]
self.origin_mongo_port = notebook_config["origin_mongo_port"]
self.origin_mongo_username = notebook_config["origin_mongo_username"]
self.origin_mongo_password = notebook_config["origin_mongo_password"]
self.transformer_mongo_host = notebook_config["transformer_mongo_host"]
self.transformer_mongo_port = notebook_config["transformer_mongo_port"]
self.transformer_mongo_username = notebook_config["transformer_mongo_username"]
self.transformer_mongo_password = notebook_config["transformer_mongo_password"]


def setup_logger(
log_file_path: Optional[str] = None,
logger_name: str = "migrator_logger",
log_level: int = logging.DEBUG,
) -> logging.Logger:
r"""
Returns a logger that writes to a file at the specified log file path
(default: "./{YYYYMMDD_HHMM}_migration.log").
"""

# If no log file path was specified, generate one.
if log_file_path is None:
yyyymmdd_hhmm: str = datetime.now().strftime("%Y%m%d_%H%M") # YYYYMMDD_HHMM
log_file_path = f"./{yyyymmdd_hhmm}_migration.log"

logger = logging.getLogger(name=logger_name)
logger.setLevel(level=log_level)
file_handler = logging.FileHandler(log_file_path)
formatter = logging.Formatter(
fmt="[%(asctime)s %(name)s %(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
file_handler.setFormatter(formatter)
if logger.hasHandlers():
logger.handlers.clear() # avoids duplicate log entries
logger.addHandler(file_handler)
return logger


def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]:
"""
Returns the names of the slots of the `Database` class that describe database collections.
:param schema_view: A `SchemaView` instance
"""
collection_names = []

for slot_name in schema_view.class_slots(DATABASE_CLASS_NAME):
slot_definition = schema_view.induced_slot(slot_name, DATABASE_CLASS_NAME)

# Filter out any hypothetical (future) slots that don't correspond to a collection (e.g. `db_version`).
if slot_definition.multivalued and slot_definition.inlined_as_list:
collection_names.append(slot_name)

# Filter out duplicate names. This is to work around the following issues in the schema:
# - https://github.com/microbiomedata/nmdc-schema/issues/1954
# - https://github.com/microbiomedata/nmdc-schema/issues/1955
collection_names = list(set(collection_names))

return collection_names
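
Taken together, these helpers are what the migration notebooks import. Below is a minimal usage sketch; loading the schema via `nmdc_schema.nmdc_data.get_nmdc_schema_definition()` is an assumption for illustration, and the notebooks may obtain their `SchemaView` differently.

# Sketch only: one way a migration notebook might wire these helpers together.
from linkml_runtime import SchemaView
from nmdc_schema.nmdc_data import get_nmdc_schema_definition  # assumption: schema source

from helpers import Config, setup_logger, get_collection_names_from_schema

cfg = Config("./.notebook.env")   # parses and validates .notebook.env
logger = setup_logger()           # logs to ./{YYYYMMDD_HHMM}_migration.log by default

schema_view = SchemaView(get_nmdc_schema_definition())
for collection_name in get_collection_names_from_schema(schema_view):
    logger.info(f"Schema describes collection: {collection_name}")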
50 changes: 50 additions & 0 deletions demo/metadata_migration/notebooks/migrate_10_5_6_to_10_8_0.ipynb
@@ -0,0 +1,50 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Migrate MongoDB database from `nmdc-schema` `v10.5.6` to `v10.8.0`",
"id": "d05efc6327778f9c"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "There are no migrators associated with any schema changes between schema versions `v10.5.6` and `v10.8.0`. So, this notebook is a \"no op\" (i.e. \"no operation\").",
"id": "b99d5924e825b9a2"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"jupyter": {
"is_executing": true
}
},
"source": "# no op",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
