Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrations: Update notebook to perform LinkML validation and allow database name configuration #750

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions demo/metadata_migration/notebooks/.notebook.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@ ORIGIN_MONGO_HOST="__REPLACE_ME__"
ORIGIN_MONGO_PORT="__REPLACE_ME__"
ORIGIN_MONGO_USERNAME="__REPLACE_ME__"
ORIGIN_MONGO_PASSWORD="__REPLACE_ME__"
ORIGIN_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc"

# Connection parameters for the Transformer Mongo server (typically a local server).
TRANSFORMER_MONGO_HOST="__REPLACE_ME__"
TRANSFORMER_MONGO_PORT="__REPLACE_ME__"
TRANSFORMER_MONGO_USERNAME="__REPLACE_ME__"
TRANSFORMER_MONGO_PASSWORD="__REPLACE_ME__"
TRANSFORMER_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc_transformer"
65 changes: 65 additions & 0 deletions demo/metadata_migration/notebooks/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Dict, Optional, List
import logging
from datetime import datetime
from functools import cache

from dotenv import dotenv_values
from linkml_runtime import SchemaView
Expand Down Expand Up @@ -78,11 +79,27 @@ def parse_and_validate_notebook_config_file(
origin_mongo_port = notebook_config["ORIGIN_MONGO_PORT"]
origin_mongo_username = notebook_config["ORIGIN_MONGO_USERNAME"]
origin_mongo_password = notebook_config["ORIGIN_MONGO_PASSWORD"]
origin_mongo_database_name = notebook_config["ORIGIN_MONGO_DATABASE_NAME"]

transformer_mongo_host = notebook_config["TRANSFORMER_MONGO_HOST"]
transformer_mongo_port = notebook_config["TRANSFORMER_MONGO_PORT"]
transformer_mongo_username = notebook_config["TRANSFORMER_MONGO_USERNAME"]
transformer_mongo_password = notebook_config["TRANSFORMER_MONGO_PASSWORD"]
transformer_mongo_database_name = notebook_config["TRANSFORMER_MONGO_DATABASE_NAME"]

# Validate the database names.
if origin_mongo_database_name.strip() == "":
raise ValueError(f"Origin database name cannot be empty")
if transformer_mongo_database_name.strip() == "":
raise ValueError(f"Transformer database name cannot be empty")
if all([
origin_mongo_host == transformer_mongo_host,
origin_mongo_port == transformer_mongo_port,
origin_mongo_database_name == transformer_mongo_database_name,
]):
# Note: We don't allow the use of the origin database as the transformer,
# because that would prevent us from easily aborting the migration.
raise ValueError(f"The origin and transformer cannot both be the same database")

return dict(
origin_dump_folder_path=origin_dump_folder_path,
Expand All @@ -94,10 +111,12 @@ def parse_and_validate_notebook_config_file(
origin_mongo_port=origin_mongo_port,
origin_mongo_username=origin_mongo_username,
origin_mongo_password=origin_mongo_password,
origin_mongo_database_name=origin_mongo_database_name,
transformer_mongo_host=transformer_mongo_host,
transformer_mongo_port=transformer_mongo_port,
transformer_mongo_username=transformer_mongo_username,
transformer_mongo_password=transformer_mongo_password,
transformer_mongo_database_name=transformer_mongo_database_name,
)

def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None:
Expand All @@ -114,10 +133,12 @@ def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None:
self.origin_mongo_port = notebook_config["origin_mongo_port"]
self.origin_mongo_username = notebook_config["origin_mongo_username"]
self.origin_mongo_password = notebook_config["origin_mongo_password"]
self.origin_mongo_database_name = notebook_config["origin_mongo_database_name"]
self.transformer_mongo_host = notebook_config["transformer_mongo_host"]
self.transformer_mongo_port = notebook_config["transformer_mongo_port"]
self.transformer_mongo_username = notebook_config["transformer_mongo_username"]
self.transformer_mongo_password = notebook_config["transformer_mongo_password"]
self.transformer_mongo_database_name = notebook_config["transformer_mongo_database_name"]


def setup_logger(
Expand Down Expand Up @@ -154,6 +175,9 @@ def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]:
Returns the names of the slots of the `Database` class that describe database collections.

:param schema_view: A `SchemaView` instance

Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py
with permission from its author.
"""
collection_names = []

Expand All @@ -170,3 +194,44 @@ def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]:
collection_names = list(set(collection_names))

return collection_names


@cache # memoizes the decorated function
def translate_class_uri_into_schema_class_name(schema_view: SchemaView, class_uri: str) -> Optional[str]:
r"""
Returns the name of the schema class that has the specified value as its `class_uri`.

Example: "nmdc:Biosample" (a `class_uri` value) -> "Biosample" (a class name)

References:
- https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.all_classes
- https://linkml.io/linkml/code/metamodel.html#linkml_runtime.linkml_model.meta.ClassDefinition.class_uri

Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py
with permission from its author.
"""
schema_class_name = None
all_class_definitions_in_schema = schema_view.all_classes()
for class_name, class_definition in all_class_definitions_in_schema.items():
if class_definition.class_uri == class_uri:
schema_class_name = class_definition.name
break
return schema_class_name


def derive_schema_class_name_from_document(schema_view: SchemaView, document: dict) -> Optional[str]:
r"""
Returns the name of the schema class, if any, of which the specified document claims to represent an instance.

This function is written under the assumption that the document has a `type` field whose value is the `class_uri`
belonging to the schema class of which the document represents an instance. Slot definition for such a field:
https://github.com/microbiomedata/berkeley-schema-fy24/blob/fc2d9600/src/schema/basic_slots.yaml#L420-L436

Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py
with permission from its author.
"""
schema_class_name = None
if "type" in document and isinstance(document["type"], str):
class_uri = document["type"]
schema_class_name = translate_class_uri_into_schema_class_name(schema_view, class_uri)
return schema_class_name
Loading
Loading